Annotation of OpenXM_contrib/gmp/mpn/x86/p6/dive_1.asm, Revision 1.1.1.1
1.1 ohara 1: dnl Intel P6 mpn_divexact_1 -- mpn by limb exact division.
2:
3: dnl Copyright 2001, 2002 Free Software Foundation, Inc.
4: dnl
5: dnl This file is part of the GNU MP Library.
6: dnl
7: dnl The GNU MP Library is free software; you can redistribute it and/or
8: dnl modify it under the terms of the GNU Lesser General Public License as
9: dnl published by the Free Software Foundation; either version 2.1 of the
10: dnl License, or (at your option) any later version.
11: dnl
12: dnl The GNU MP Library is distributed in the hope that it will be useful,
13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15: dnl Lesser General Public License for more details.
16: dnl
17: dnl You should have received a copy of the GNU Lesser General Public
18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20: dnl Suite 330, Boston, MA 02111-1307, USA.
21:
22: include(`../config.m4')
23:
24:
25: C odd even divisor
26: C P6: 10.0 12.0 cycles/limb
27:
28:
29: C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
30: C mp_limb_t divisor);
31: C
32: C The odd case is basically the same as mpn_modexact_1_odd, just with an
33: C extra store, and it runs at the same 10 cycles which is the dependent
34: C chain.
35: C
36: C The shifts for the even case aren't on the dependent chain so in principle
37: C it could run the same too, but nothing running at 10 has been found.
38: C Perhaps there are too many uops (an extra 4 over the odd case).
39:
40: defframe(PARAM_DIVISOR,16) C 4th argument in the C prototype: divisor
41: defframe(PARAM_SIZE, 12) C 3rd argument: size
42: defframe(PARAM_SRC, 8) C 2nd argument: src
43: defframe(PARAM_DST, 4) C 1st argument: dst
44: C NOTE(review): defframe is defined outside this file; the numbers are presumably byte offsets from the entry stack pointer - confirm
45: defframe(SAVE_EBX, -4) C slot where the code stores ebx on entry and reloads it before ret
46: defframe(SAVE_ESI, -8) C likewise for esi
47: defframe(SAVE_EDI, -12) C likewise for edi
48: defframe(SAVE_EBP, -16) C likewise for ebp
49: defframe(VAR_INVERSE, -20) C local: the computed inverse of the odd divisor, read by both loops
50: deflit(STACK_SPACE, 20) C amount subtracted from esp on entry; covers the five 4-byte slots -4 to -20
51:
52: TEXT
53: C mpn_divexact_1: each quotient limb is (src limb - carry) * inverse of the odd part of the divisor; an even divisor also shifts src right by its trailing zero count.
54: ALIGN(16)
55: PROLOGUE(mpn_divexact_1)
56: deflit(`FRAME',0)
57: C Setup: split the divisor into an odd part d and a shift count, then build inv with d*inv == 1 mod 2^BITS_PER_MP_LIMB.
58: movl PARAM_DIVISOR, %eax C eax = divisor
59: subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE) C make room for the SAVE_ and VAR_ slots
60:
61: movl %esi, SAVE_ESI
62: movl PARAM_SRC, %esi C esi = src
63:
64: movl %ebx, SAVE_EBX
65: movl PARAM_SIZE, %ebx C ebx = size
66:
67: bsfl %eax, %ecx C ecx = number of trailing zero bits in the divisor
68:
69: movl %ebp, SAVE_EBP
70:
71: shrl %cl, %eax C d = divisor with its trailing zero bits shifted out
72:
73: movl %eax, %edx C edx = d
74: shrl %eax C d/2 without twos
75:
76: movl %edx, PARAM_DIVISOR C parameter slot now holds d, not the original divisor
77: andl $127, %eax C low 7 bits of d/2 index the inverse table
78:
79: ifdef(`PIC',`
80: call L(movl_eip_ebp) C ebp = address of the next instruction
81: addl $_GLOBAL_OFFSET_TABLE_, %ebp C ebp = GOT base (usual i386 PIC sequence)
82: movl modlimb_invert_table@GOT(%ebp), %ebp C ebp = table address loaded through the GOT
83: movzbl (%eax,%ebp), %ebp C inv 8 bits
84:
85: ',`
86: dnl non-PIC
87: movzbl modlimb_invert_table(%eax), %ebp C inv 8 bits
88: ')
89: C First refinement step, interleaved with the pointer setup.
90: leal (%ebp,%ebp), %eax C 2*inv
91:
92: imull %ebp, %ebp C inv*inv
93:
94: movl %edi, SAVE_EDI
95: movl PARAM_DST, %edi C edi = dst
96:
97: leal (%esi,%ebx,4), %esi C esi = &src[size]
98:
99: imull PARAM_DIVISOR, %ebp C inv*inv*d
100:
101: subl %ebp, %eax C inv = 2*inv - inv*inv*d
102: leal (%eax,%eax), %ebp C 2*inv (second refinement step starts here)
103:
104: imull %eax, %eax C inv*inv
105:
106: leal (%edi,%ebx,4), %edi C edi = &dst[size]
107: negl %ebx C ebx = -size, counts up to zero
108:
109: movl %edi, PARAM_DST C parameter slot now holds &dst[size]; the even loop reloads edi from it
110:
111: imull PARAM_DIVISOR, %eax C inv*inv*d
112:
113: subl %eax, %ebp C inv = 2*inv - inv*inv*d
114:
115: ASSERT(e,` C d*inv == 1 mod 2^BITS_PER_MP_LIMB
116: movl PARAM_DIVISOR, %eax
117: imull %ebp, %eax
118: cmpl $1, %eax')
119:
120: movl %ebp, VAR_INVERSE C both loops read the inverse from this slot
121: movl (%esi,%ebx,4), %eax C src[0]
122:
123: orl %ecx, %ecx C nonzero shift count means the original divisor was even
124: jnz L(even)
125:
126: C odd divisor: ecx (the bsfl count) is zero here and serves as the initial carry bit
127: jmp L(odd_entry)
128:
129:
130: C The dependent chain here is (cycle counts on the right)
131: C
132: C subl %edx, %eax 1
133: C imull VAR_INVERSE, %eax 4
134: C mull PARAM_DIVISOR 5
135: C ----
136: C total 10
137: C
138: C and this is the measured speed. No special scheduling is necessary, out
139: C of order execution hides the load latency.
140:
141: L(odd_top):
142: C eax scratch (src limb)
143: C ebx counter, limbs, negative
144: C ecx carry bit
145: C edx carry limb, high of last product
146: C esi &src[size]
147: C edi &dst[size]
148: C ebp (not referenced in this loop)
149:
150: mull PARAM_DIVISOR C edx:eax = previous quotient limb * d; only the high half edx is used
151:
152: movl (%esi,%ebx,4), %eax C next src limb, overwriting the low half of the product
153: subl %ecx, %eax C subtract the carry bit
154:
155: sbbl %ecx, %ecx C ecx = 0 or -1 according to the borrow
156: subl %edx, %eax C subtract the carry limb
157:
158: sbbl $0, %ecx C fold in the second borrow (still negated)
159:
160: L(odd_entry):
161: imull VAR_INVERSE, %eax C quotient limb = eax * inverse
162:
163: movl %eax, (%edi,%ebx,4) C store quotient limb
164: negl %ecx C carry bit back to non-negative form for the next iteration
165:
166: incl %ebx
167: jnz L(odd_top)
168:
169: C Odd case done: restore the saved registers and release the frame.
170: movl SAVE_ESI, %esi
171:
172: movl SAVE_EDI, %edi
173:
174: movl SAVE_EBP, %ebp
175:
176: movl SAVE_EBX, %ebx
177: addl $STACK_SPACE, %esp
178:
179: ret
180:
181:
182: L(even):
183: C eax src[0]
184: C ebx counter, limbs, negative
185: C ecx shift
186: C edx
187: C esi
188: C edi
189: C ebp
190:
191: xorl %ebp, %ebp C initial carry bit
192: xorl %edx, %edx C initial carry limb (for size==1)
193:
194: incl %ebx C counter runs one limb ahead in the even code, hence the -4 offsets below
195: jz L(even_one) C size==1: no higher limb to shift in
196:
197: movl (%esi,%ebx,4), %edi C src[1]
198:
199: shrdl( %cl, %edi, %eax) C eax = src[0] shifted right by cl, refilled from the low bits of src[1]
200:
201: jmp L(even_entry)
202:
203:
204: L(even_top):
205: C eax scratch
206: C ebx counter, limbs, negative
207: C ecx shift
208: C edx scratch
209: C esi &src[size]
210: C edi &dst[size] and scratch
211: C ebp carry bit
212:
213: movl (%esi,%ebx,4), %edi C limb above the current one, supplies the bits shifted in
214:
215: mull PARAM_DIVISOR C edx:eax = previous quotient limb * d; only the high half edx is used
216:
217: movl -4(%esi,%ebx,4), %eax C current src limb
218: shrdl( %cl, %edi, %eax) C shift right by cl, refilling from edi
219:
220: subl %ebp, %eax C subtract the carry bit
221:
222: sbbl %ebp, %ebp C ebp = 0 or -1 according to the borrow
223: subl %edx, %eax C subtract the carry limb
224:
225: sbbl $0, %ebp C fold in the second borrow (still negated)
226:
227: L(even_entry):
228: imull VAR_INVERSE, %eax C quotient limb = eax * inverse
229:
230: movl PARAM_DST, %edi C reload &dst[size]; edi was used as scratch above
231: negl %ebp C carry bit back to non-negative form
232:
233: movl %eax, -4(%edi,%ebx,4) C store quotient limb
234: incl %ebx
235: jnz L(even_top)
236:
237: C Last limb of the even case: nothing above it to shift in, and no borrow needs to be kept.
238:
239: mull PARAM_DIVISOR C edx = carry limb from the previous quotient limb
240:
241: movl -4(%esi), %eax C src[size-1]
242:
243: L(even_one):
244: shrl %cl, %eax C plain shift, zeros enter from the top
245: movl SAVE_ESI, %esi
246:
247: subl %ebp, %eax C subtract the carry bit
248: movl SAVE_EBP, %ebp
249:
250: subl %edx, %eax C subtract the carry limb
251: movl SAVE_EBX, %ebx
252:
253: imull VAR_INVERSE, %eax C final quotient limb
254:
255: movl %eax, -4(%edi) C dst[size-1]; edi is &dst[size] on both paths reaching here
256: movl SAVE_EDI, %edi
257: addl $STACK_SPACE, %esp
258:
259: ret
260:
261:
262: ifdef(`PIC',`
263: L(movl_eip_ebp):
264: movl (%esp), %ebp C ebp = return address, i.e. the instruction after the call
265: ret
266: ')
267:
268: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>