Annotation of OpenXM_contrib/gmp/mpn/x86/k6/mmx/dive_1.asm, Revision 1.1.1.1
1.1 ohara 1: dnl AMD K6 mpn_divexact_1 -- mpn by limb exact division.
2:
3: dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
4: dnl
5: dnl This file is part of the GNU MP Library.
6: dnl
7: dnl The GNU MP Library is free software; you can redistribute it and/or
8: dnl modify it under the terms of the GNU Lesser General Public License as
9: dnl published by the Free Software Foundation; either version 2.1 of the
10: dnl License, or (at your option) any later version.
11: dnl
12: dnl The GNU MP Library is distributed in the hope that it will be useful,
13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15: dnl Lesser General Public License for more details.
16: dnl
17: dnl You should have received a copy of the GNU Lesser General Public
18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20: dnl Suite 330, Boston, MA 02111-1307, USA.
21:
22: include(`../config.m4')
23:
24:
25: C            divisor
26: C          odd    even
27: C K6:     10.0    12.0   cycles/limb
28: C K6-2:   10.0    11.5
29:
30:
31: C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
32: C mp_limb_t divisor);
33: C
34: C A simple divl is used for size==1. This is about 10 cycles faster for an
35: C odd divisor or 20 cycles for an even divisor.
36: C
37: C The loops are quite sensitive to code alignment, speeds should be
38: C rechecked (odd and even divisor, pic and non-pic) if contemplating
39: C changing anything.
40:
41: defframe(PARAM_DIVISOR,16) C divisor, the 4th argument in the prototype; this slot is later overwritten with d without twos
42: defframe(PARAM_SIZE, 12) C size, the number of limbs at src and dst
43: defframe(PARAM_SRC, 8) C src, the dividend limbs
44: defframe(PARAM_DST, 4) C dst, where the quotient limbs are stored
45:
46: dnl re-use parameter space: VAR_INVERSE aliases the PARAM_DST slot, which is only written after dst has been copied into edi
47: define(VAR_INVERSE,`PARAM_DST')
48:
49: TEXT
50:
51: ALIGN(32)
52: PROLOGUE(mpn_divexact_1)
53: deflit(`FRAME',0) C nothing pushed yet. NOTE(review): FRAME presumably keeps the PARAM_xxx offsets right across pushes, confirm in the macro definitions
54: C Entry: size==1 is finished right here with a plain divl, any other size goes to L(two_or_more).
55: movl PARAM_SIZE, %ecx C ecx = size
56:
57: movl PARAM_SRC, %eax C eax = src pointer
58: xorl %edx, %edx C edx = 0, the high half of the divl dividend, and the starting count for L(strip_twos)
59:
60: cmpl $1, %ecx
61: jnz L(two_or_more) C size != 1. NOTE(review): size==0 would also branch, presumably callers guarantee size >= 1
62:
63: movl (%eax), %eax C eax = src[0]
64:
65: divl PARAM_DIVISOR C edx:eax / divisor, quotient in eax, remainder in edx is ignored
66:
67: movl PARAM_DST, %ecx C ecx = dst pointer
68: movl %eax, (%ecx) C dst[0] = quotient
69:
70: ret
71:
72:
73: L(two_or_more):
74: movl PARAM_DIVISOR, %eax C eax = divisor, edx is still 0 and ecx is still size from the entry code
75: pushl %ebx FRAME_pushl() C save ebx, it will hold src end
76:
77: movl PARAM_SRC, %ebx C ebx = src
78: pushl %ebp FRAME_pushl() C save ebp, it will hold the inverse being refined and later the carry bit
79:
80: L(strip_twos):
81: shrl %eax C move the low bit of the divisor into the carry flag
82: incl %edx C will get shift+1, incl leaves the carry flag from shrl untouched
83:
84: jnc L(strip_twos) C loop until a 1 bit has been shifted out, eax is then the odd part of d divided by 2
85: pushl %esi FRAME_pushl() C save esi, it will hold d without twos and later the current limb
86:
87: leal 1(%eax,%eax), %esi C d without twos
88: andl $127, %eax C d/2, 7 bits, used as the index into modlimb_invert_table
89:
90: ifdef(`PIC',`
91: call L(movl_eip_ebp) C ebp = address of the instruction after this call
92:
93: addl $_GLOBAL_OFFSET_TABLE_, %ebp C usual i386 PIC idiom turning that address into the GOT address
94: C
95: movl modlimb_invert_table@GOT(%ebp), %ebp C ebp = address of modlimb_invert_table, loaded from its GOT entry
96: C
97: Zdisp( movzbl, 0,(%eax,%ebp), %eax) C inv 8 bits, the table byte at index eax
98: ',`
99:
100: dnl non-PIC
101: movzbl modlimb_invert_table(%eax), %eax C inv 8 bits
102: ')
103: pushl %edi FRAME_pushl() C save edi, it will hold dst end
104:
105: leal (%eax,%eax), %ebp C 2*inv
106:
107: imull %eax, %eax C inv*inv
108:
109: movl PARAM_DST, %edi C edi = dst, read before VAR_INVERSE overwrites this same stack slot
110:
111: imull %esi, %eax C inv*inv*d
112:
113: subl %eax, %ebp C inv = 2*inv - inv*inv*d, first refinement of the 8 bit table value
114: leal (%ebp,%ebp), %eax C 2*inv
115:
116: imull %ebp, %ebp C inv*inv
117:
118: movl %esi, PARAM_DIVISOR C d without twos, replaces the original divisor in the parameter slot for the loops
119: leal (%ebx,%ecx,4), %ebx C src end
120:
121: imull %esi, %ebp C inv*inv*d
122:
123: leal (%edi,%ecx,4), %edi C dst end
124: negl %ecx C -size, counts up towards zero in the loops
125:
126: subl %ebp, %eax C inv = 2*inv - inv*inv*d, second refinement, expected to be a full limb inverse per the ASSERT below
127: subl $1, %edx C shift amount, and clear carry. The zero flag is set exactly when the divisor was odd
128:
129: ASSERT(e,` C expect d*inv == 1 mod 2^BITS_PER_MP_LIMB
130: pushl %eax FRAME_pushl()
131: imull PARAM_DIVISOR, %eax
132: cmpl $1, %eax
133: popl %eax FRAME_popl()')
134:
135: movl %eax, VAR_INVERSE C store the inverse, movl does not change the flags
136: jnz L(even) C tests the flags from subl $1 above. NOTE(review): relies on ASSERT preserving the flags when enabled, confirm in the macro definition
137:
138: movl (%ebx,%ecx,4), %esi C src low limb
139: jmp L(odd_entry) C edx is 0 here, so the first pass through L(odd_entry) subtracts nothing and sets ebp to 0
140:
141:
142: ALIGN(16)
143: nop C code alignment
144: L(odd_top):
145: C eax scratch
146: C ebx src end
147: C ecx counter, limbs, negative, already advanced past the limb held in esi
148: C edx inverse
149: C esi next limb, adjusted for carry
150: C edi dst end
151: C ebp carry bit, 0 or -1
152: C Odd divisor loop: each pass stores one quotient limb q = esi*inverse, then derives the carry out of q*d for the next source limb.
153: imull %edx, %esi C esi = q, the quotient limb
154:
155: movl PARAM_DIVISOR, %eax C eax = d without twos
156: movl %esi, -4(%edi,%ecx,4) C store q, the -4 compensates for ecx having been incremented already
157:
158: mull %esi C carry limb in edx
159:
160: subl %ebp, %edx C apply carry bit, ebp is 0 or -1 so this adds 0 or 1
161: movl (%ebx,%ecx,4), %esi C esi = next source limb
162:
163: L(odd_entry):
164: subl %edx, %esi C apply carry limb
165: movl VAR_INVERSE, %edx C reload the inverse, edx was overwritten by mull
166:
167: sbbl %ebp, %ebp C 0 or -1, the borrow from the subl above
168:
169: incl %ecx
170: jnz L(odd_top) C loop while limbs remain after the one in esi
171:
172: C Last limb: ecx is 0 here and no further carry needs to be derived.
173: imull %edx, %esi C esi = final quotient limb
174:
175: movl %esi, -4(%edi,%ecx,4) C store it in the highest dst limb
176:
177: popl %edi C restore the saved registers in reverse order of the pushes
178: popl %esi
179:
180: popl %ebp
181: popl %ebx
182:
183: ret
184:
185:
186: ifdef(`PIC',`
187: L(movl_eip_ebp):
188: movl (%esp), %ebp C ebp = return address, that is the address of the instruction following the call to this helper
189: ret
190:
191: ALIGN(8)
192: nop C code alignment, necessary for claimed speed
193: nop C more padding for the same code alignment
194: ',`
195: C non-PIC code alignment already ok at 0x9a
196: ')
197:
198: L(even):
199: C eax inverse, not needed since it is also in VAR_INVERSE
200: C ebx src end
201: C ecx -size
202: C edx twos
203: C esi d without twos, not needed since it is also in PARAM_DIVISOR
204: C edi dst end
205: C ebp leftover from the inverse calculation, cleared below
206: C Even divisor: source limbs are read in pairs with movq and shifted right by twos, the low 32 bits are then treated as in the odd case.
207: xorl %ebp, %ebp C no carry bit to start with
208: Zdisp( movq, 0,(%ebx,%ecx,4), %mm0) C src[0,1]. NOTE(review): Zdisp presumably forces an explicit zero displacement for code size, confirm in the macro definition
209:
210: movd %edx, %mm7 C mm7 = twos, the shift count
211: movl VAR_INVERSE, %edx C edx = inverse
212:
213: addl $2, %ecx C zero flag set when size is 2
214: psrlq %mm7, %mm0 C shift the limb pair right, psrlq and movd leave the integer flags alone
215:
216: movd %mm0, %esi C esi = low limb of the shifted pair
217: jz L(even_two) C if only two limbs
218:
219:
220: C Out-of-order execution is good enough to hide the load/rshift/movd
221: C latency. Having imul at the top of the loop gives 11.5 c/l instead of 12,
222: C on K6-2. In fact there's only 11 of decode, but nothing running at 11 has
223: C been found. Maybe the fact every second movq is unaligned costs the extra
224: C 0.5.
225:
226: L(even_top):
227: C eax scratch
228: C ebx src end
229: C ecx counter, limbs, negative, two ahead of the limb held in esi
230: C edx inverse
231: C esi next limb, adjusted for carry
232: C edi dst end
233: C ebp carry bit, 0 or -1
234: C
235: C mm0 scratch, source limbs
236: C mm7 twos
237:
238: imull %edx, %esi C esi = q, the quotient limb
239:
240: movl %esi, -8(%edi,%ecx,4) C store q, the -8 compensates for ecx being two limbs ahead
241: movl PARAM_DIVISOR, %eax C eax = d without twos
242:
243: mull %esi C carry limb in edx
244:
245: movq -4(%ebx,%ecx,4), %mm0 C next pair of source limbs, starting one limb after the one just processed
246: psrlq %mm7, %mm0 C shift the pair right by twos
247:
248: movd %mm0, %esi C esi = low limb of the shifted pair
249: subl %ebp, %edx C apply carry bit
250:
251: subl %edx, %esi C apply carry limb
252: movl VAR_INVERSE, %edx C reload the inverse, edx was overwritten by mull
253:
254: sbbl %ebp, %ebp C 0 or -1
255:
256: incl %ecx
257: jnz L(even_top) C leave the loop with esi holding the second highest limb, the highest is done at L(even_two)
258:
259:
260: L(even_two):
261: movd -4(%ebx), %mm0 C src high limb
262: psrlq %mm7, %mm0 C shift it right by twos, movd cleared the upper half of mm0 so zeros come in at the top
263:
264: imull %edx, %esi C esi = quotient limb for the second highest position
265:
266: movl %esi, -8(%edi) C store it in the second highest dst limb
267: movl PARAM_DIVISOR, %eax C eax = d without twos
268:
269: mull %esi C carry limb in edx
270:
271: movd %mm0, %esi C esi = shifted high limb
272: subl %ebp, %edx C apply carry bit
273:
274: movl VAR_INVERSE, %eax C inverse goes in eax this time since edx holds the carry limb
275: subl %edx, %esi C apply carry limb
276:
277: imull %eax, %esi C esi = final quotient limb
278:
279: movl %esi, -4(%edi) C store it in the highest dst limb
280:
281: popl %edi C restore the saved registers in reverse order of the pushes
282: popl %esi
283:
284: popl %ebp
285: popl %ebx
286:
287: emms_or_femms C NOTE(review): presumably emits emms or femms to leave MMX state before returning, confirm in the macro definition
288:
289: ret
290:
291: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>