Annotation of OpenXM_contrib/gmp/mpn/x86/pentium/mode1o.asm, Revision 1.1.1.1
1.1 ohara 1: dnl Intel Pentium mpn_modexact_1_odd -- exact division style remainder.
2:
3: dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
4: dnl
5: dnl This file is part of the GNU MP Library.
6: dnl
7: dnl The GNU MP Library is free software; you can redistribute it and/or
8: dnl modify it under the terms of the GNU Lesser General Public License as
9: dnl published by the Free Software Foundation; either version 2.1 of the
10: dnl License, or (at your option) any later version.
11: dnl
12: dnl The GNU MP Library is distributed in the hope that it will be useful,
13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15: dnl Lesser General Public License for more details.
16: dnl
17: dnl You should have received a copy of the GNU Lesser General Public
18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20: dnl Suite 330, Boston, MA 02111-1307, USA.
21:
22: include(`../config.m4')
23:
24:
25: C P5: 23.0 cycles/limb
26:
27:
28: C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
29: C mp_limb_t divisor);
30: C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
31: C mp_limb_t divisor, mp_limb_t carry);
32: C
33: C There seems to be no way to pair up the two lone instructions in the main loop.
34: C
35: C The special case for size==1 saves about 20 cycles (non-PIC), making it
36: C the same as mpn_mod_1, and in fact making modexact faster than mod_1 at
37: C all sizes.
38: C
39: C Alternatives:
40: C
41: C Using mmx for the multiplies might be possible, with pmullw and pmulhw
42: C having just 3 cycle latencies, but carry bit handling would probably be
43: C complicated.
44: C Stack argument slots in prototype order: src size divisor carry (carry is only read by the 1c entry point).
45: defframe(PARAM_CARRY, 16)
46: defframe(PARAM_DIVISOR,12)
47: defframe(PARAM_SIZE, 8)
48: defframe(PARAM_SRC, 4)
49:
50: dnl re-using parameter space: size is copied to %ebx before the inverse is stored over its stack slot
51: define(VAR_INVERSE,`PARAM_SIZE')
52:
53: TEXT
54: C mpn_modexact_1c_odd: entry point taking an explicit initial carry; it only loads its arguments and joins the shared code.
55: ALIGN(16)
56: PROLOGUE(mpn_modexact_1c_odd)
57: deflit(`FRAME',0)
58: C Set up the registers expected at L(start_1c): eax = divisor and edx = carry.
59: movl PARAM_DIVISOR, %eax
60: movl PARAM_CARRY, %edx C caller-supplied carry argument
61: C L(start_1c) is defined inside mpn_modexact_1_odd below; nothing has been pushed here so FRAME is 0 on both paths.
62: jmp L(start_1c)
63:
64: EPILOGUE()
65: C mpn_modexact_1_odd: same computation with the initial carry fixed at zero; also holds the code shared with the 1c entry.
66: ALIGN(16)
67: PROLOGUE(mpn_modexact_1_odd)
68: deflit(`FRAME',0)
69: C eax = divisor for the inverse lookup; edx = 0 as the initial carry.
70: movl PARAM_DIVISOR, %eax
71: xorl %edx, %edx C carry
72: C Shared entry for both functions: eax = divisor and edx = carry with nothing pushed yet.
73: L(start_1c):
74: C Both variants below save ebx and form the table index (d/2)&127 in eax and size-2 in ebx then branch to L(one_limb) if size-2 borrows.
75: ifdef(`PIC',`
76: call L(here) FRAME_pushl()
77: L(here):
78: C The call pushed the address of L(here); it is read back below to locate the GOT.
79: shrl %eax C d/2
80: movl (%esp), %ecx C eip
81: C ecx becomes the GOT address; the stack slot left by the call is then reused to save ebx.
82: addl $_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ecx
83: movl %ebx, (%esp) C push ebx
84:
85: andl $127, %eax C table index = (d/2) mod 128
86: movl PARAM_SIZE, %ebx
87:
88: movl modlimb_invert_table@GOT(%ecx), %ecx C ecx = &modlimb_invert_table
89: subl $2, %ebx C size-2 with borrow if size<2
90: C The movb does not change the flags so the jc still tests the borrow from subl.
91: movb (%eax,%ecx), %cl C inv 8 bits
92: jc L(one_limb)
93:
94: ',`
95: dnl non-PIC
96: shrl %eax C d/2
97: pushl %ebx FRAME_pushl()
98:
99: movl PARAM_SIZE, %ebx
100: andl $127, %eax C table index = (d/2) mod 128
101:
102: subl $2, %ebx C size-2 with borrow if size<2
103: jc L(one_limb)
104:
105: movb modlimb_invert_table(%eax), %cl C inv 8 bits
106: ')
107: C Two Newton steps inv = 2*inv - inv*inv*d refine the table inverse in ecx; the ASSERT below checks d*inv == 1 mod 2^BITS_PER_MP_LIMB.
108: movl %ecx, %eax
109: addl %ecx, %ecx C 2*inv
110:
111: imull %eax, %eax C inv*inv
112:
113: imull PARAM_DIVISOR, %eax C inv*inv*d
114:
115: subl %eax, %ecx C inv = 2*inv - inv*inv*d
116: C Second Newton step using the same formula.
117: movl %ecx, %eax
118: addl %ecx, %ecx C 2*inv
119:
120: imull %eax, %eax C inv*inv
121:
122: imull PARAM_DIVISOR, %eax C inv*inv*d
123:
124: subl %eax, %ecx C inv = 2*inv - inv*inv*d
125: pushl %esi FRAME_pushl()
126:
127: ASSERT(e,` C d*inv == 1 mod 2^BITS_PER_MP_LIMB
128: movl %ecx, %eax
129: imull PARAM_DIVISOR, %eax
130: cmpl $1, %eax')
131: C Loop setup: inverse saved in VAR_INVERSE and eax = src[0] and esi = &src[size-1] and ebx = -(size-1).
132: movl PARAM_SRC, %esi
133: movl %ecx, VAR_INVERSE
134:
135: movl (%esi), %eax C src[0]
136: leal 4(%esi,%ebx,4), %esi C &src[size-1]
137:
138: xorl $-1, %ebx C -(size-1)
139: ASSERT(nz)
140: jmp L(entry) C first pass subtracts the initial carry in edx from src[0]
141:
142:
143: C The use of VAR_INVERSE means only a store is needed for that value, rather
144: C than a push and pop of say %edi.
145:
146: ALIGN(16)
147: L(top):
148: C eax scratch, low product
149: C ebx counter, limbs, negative
150: C ecx carry bit, held negated (0 or -1)
151: C edx scratch, high product
152: C esi &src[size-1]
153: C edi
154: C ebp
155: C Each pass forms s = src[i] - (h + c) and q = s*inverse where h is the high limb of the previous q*d and c the previous borrow.
156: mull PARAM_DIVISOR C h:dummy = q*d
157:
158: movl (%esi,%ebx,4), %eax C src[i]
159: subl %ecx, %edx C h -= -c
160:
161: L(entry):
162: subl %edx, %eax C s = src[i] - h
163:
164: sbbl %ecx, %ecx C new -c (0 or -1)
165:
166: imull VAR_INVERSE, %eax C q = s*i
167:
168: incl %ebx C step the negative index toward zero
169: jnz L(top)
170:
171: C Last limb: after the next three instructions eax = src[size-1] and edx = h + c from the final loop quotient.
172: mull PARAM_DIVISOR
173:
174: movl (%esi), %eax C src high
175: subl %ecx, %edx C h -= -c
176: C Take the shorter L(skip_last) ending when src high <= divisor (unsigned compare).
177: cmpl PARAM_DIVISOR, %eax
178:
179: jbe L(skip_last)
180: deflit(FRAME_LAST,FRAME)
181: C FRAME_LAST records the frame depth at the branch so that L(skip_last) can restore it.
182: C General ending: one more s and q step then return the high limb of q*d plus the borrow.
183: subl %edx, %eax C s = src[i] - h
184: popl %esi FRAME_popl()
185:
186: sbbl %ecx, %ecx C c (0 or -1)
187: popl %ebx FRAME_popl()
188:
189: imull VAR_INVERSE, %eax C q = s*i
190:
191: mull PARAM_DIVISOR C h:dummy = q*d
192:
193: movl %edx, %eax
194:
195: subl %ecx, %eax C return value = h + c
196:
197: ret
198:
199:
200: C When high<=divisor (the jbe above) can skip last step.
201:
202: L(skip_last):
203: deflit(`FRAME',FRAME_LAST)
204: C eax src high
205: C ebx
206: C ecx
207: C edx r
208: C esi
209: C Return r - s or r - s + divisor if that subtraction borrows; ebx is used as scratch before being restored.
210: subl %eax, %edx C r-s
211: popl %esi FRAME_popl()
212:
213: sbbl %eax, %eax C -1 if underflow
214: movl PARAM_DIVISOR, %ebx
215:
216: andl %ebx, %eax C divisor if underflow
217: popl %ebx FRAME_popl()
218:
219: addl %edx, %eax C addback if underflow
220:
221: ret
222:
223:
224: C Special case for size==1 using a division for r = c-a mod d.
225: C Could look for a-c<d and save a division sometimes, but that doesn't seem
226: C worth bothering about.
227:
228: L(one_limb):
229: deflit(`FRAME',4)
230: C eax
231: C ebx size-2 (==-1)
232: C ecx
233: C edx carry
234: C esi untouched so far
235: C edi
236: C ebp
237: C Only ebx has been saved here (FRAME is 4). NOTE(review): the jc is also taken for size==0; presumably callers pass size>=1 - confirm.
238: movl %edx, %eax C eax = carry
239: movl PARAM_SRC, %edx
240:
241: movl PARAM_DIVISOR, %ecx
242: popl %ebx FRAME_popl()
243:
244: subl (%edx), %eax C c-a
245:
246: sbbl %edx, %edx C -1 if c<a else 0
247: decl %ecx C d-1
248:
249: andl %ecx, %edx C b*d+c-a if c<a, or c-a if c>=a
250: C divl divides edx:eax by the divisor and leaves the remainder in edx.
251: divl PARAM_DIVISOR
252:
253: movl %edx, %eax C return the remainder
254:
255: ret
256:
257: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>