Annotation of OpenXM_contrib/gmp/mpn/x86/p6/mode1o.asm, Revision 1.1.1.1
1.1 ohara 1: dnl Intel P6 mpn_modexact_1_odd -- exact division style remainder.
2:
3: dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
4: dnl
5: dnl This file is part of the GNU MP Library.
6: dnl
7: dnl The GNU MP Library is free software; you can redistribute it and/or
8: dnl modify it under the terms of the GNU Lesser General Public License as
9: dnl published by the Free Software Foundation; either version 2.1 of the
10: dnl License, or (at your option) any later version.
11: dnl
12: dnl The GNU MP Library is distributed in the hope that it will be useful,
13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15: dnl Lesser General Public License for more details.
16: dnl
17: dnl You should have received a copy of the GNU Lesser General Public
18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20: dnl Suite 330, Boston, MA 02111-1307, USA.
21:
22: include(`../config.m4')
23:
24:
25: C P6: 10.0 cycles/limb
26:
27:
28: C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
29: C mp_limb_t divisor);
30: C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
31: C mp_limb_t divisor, mp_limb_t carry);
32: C
33: C It's not worth skipping a step at the end when high<divisor since the main
34: C loop is only 10 cycles.
35:
36: defframe(PARAM_CARRY, 16) C carry argument, read only by the mpn_modexact_1c_odd entry
37: defframe(PARAM_DIVISOR,12) C divisor argument, re-read from the stack by the loop
38: defframe(PARAM_SIZE, 8) C size argument, a count of limbs
39: defframe(PARAM_SRC, 4) C src argument, pointer to the first limb
40:
41: dnl Not enough room under modexact_1 to make these re-use the parameter
42: dnl space, unfortunately.
43: defframe(SAVE_EBX, -4) C save slot for ebx
44: defframe(SAVE_ESI, -8) C save slot for esi
45: defframe(SAVE_EDI, -12) C save slot for edi
46: deflit(STACK_SPACE, 12) C bytes subtracted from esp to hold the three save slots
47:
48: TEXT
49:
50: ALIGN(16)
51: PROLOGUE(mpn_modexact_1c_odd)
52: deflit(`FRAME',0)
53: C Entry point with an explicit carry: put it in ecx and share the code at L(start_1c).
54: movl PARAM_CARRY, %ecx C ecx = carry argument
55: jmp L(start_1c) C join mpn_modexact_1_odd just past its clearing of ecx
56:
57: EPILOGUE()
58:
59: ALIGN(16)
60: PROLOGUE(mpn_modexact_1_odd)
61: deflit(`FRAME',0)
62: C Entry point without a carry argument, so the initial carry in ecx is zero.
63: xorl %ecx, %ecx
64: L(start_1c): C mpn_modexact_1c_odd jumps here with ecx = its carry argument
65: movl PARAM_DIVISOR, %eax
66:
67: subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE)
68:
69: movl %esi, SAVE_ESI
70: movl PARAM_SRC, %esi
71:
72: shrl %eax C d/2
73: movl %edi, SAVE_EDI
74:
75: andl $127, %eax C low 7 bits of d/2, used as the table index
76:
77: ifdef(`PIC',`
78: call L(movl_eip_edi) C edi = address of the following instruction
79: addl $_GLOBAL_OFFSET_TABLE_, %edi C edi = base for the @GOT reference below
80: movl modlimb_invert_table@GOT(%edi), %edi C edi = table address loaded from the GOT entry
81: movzbl (%eax,%edi), %edi C inv 8 bits
82:
83: ',`
84: dnl non-PIC
85: movzbl modlimb_invert_table(%eax), %edi C inv 8 bits
86: ')
87: C Two rounds of inv = 2*inv - inv*inv*d follow, widening the 8-bit table inverse to a full limb.
88: xorl %edx, %edx C initial extra carry
89: leal (%edi,%edi), %eax C 2*inv
90:
91: imull %edi, %edi C inv*inv
92:
93: movl %ebx, SAVE_EBX
94: movl PARAM_SIZE, %ebx
95:
96: imull PARAM_DIVISOR, %edi C inv*inv*d
97:
98: subl %edi, %eax C inv = 2*inv - inv*inv*d
99: leal (%eax,%eax), %edi C 2*inv
100:
101: imull %eax, %eax C inv*inv
102:
103: imull PARAM_DIVISOR, %eax C inv*inv*d
104:
105: leal (%esi,%ebx,4), %esi C src end
106: negl %ebx C -size
107:
108: subl %eax, %edi C inv = 2*inv - inv*inv*d
109:
110: ASSERT(e,` C d*inv == 1 mod 2^BITS_PER_MP_LIMB
111: movl PARAM_DIVISOR, %eax
112: imull %edi, %eax
113: cmpl $1, %eax')
114:
115:
116: C The dependent chain here is
117: C
118: C subl %edx, %eax 1
119: C imull %edi, %eax 4
120: C mull PARAM_DIVISOR 5
121: C ----
122: C total 10
123: C
124: C and this is the measured speed. No special scheduling is necessary, out
125: C of order execution hides the load latency.
126:
127: L(top):
128: C eax scratch (src limb)
129: C ebx counter, limbs, negative
130: C ecx carry bit, 0 or 1 (the whole carry argument on the first pass from the 1c entry)
131: C edx carry limb, high of last product
132: C esi &src[size]
133: C edi inverse
134: C ebp (not used by this routine)
135:
136: movl (%esi,%ebx,4), %eax C next src limb, ebx counts up from -size towards zero
137: subl %ecx, %eax C subtract the carry bit
138:
139: sbbl %ecx, %ecx C ecx = 0 or -1 according to the borrow just produced
140: subl %edx, %eax C subtract the carry limb
141:
142: sbbl $0, %ecx C fold in the second borrow, ecx still negated
143:
144: imull %edi, %eax C q = low limb of (limb - carries) * inverse
145:
146: negl %ecx C ecx = borrow count as a non-negative value for the next pass
147:
148: mull PARAM_DIVISOR C edx:eax = q * d, edx becomes the next carry limb
149:
150: incl %ebx
151: jnz L(top) C loop until the negative counter reaches zero
152:
153: C Loop finished: form the result in eax, restore saved registers, release the frame.
154: movl SAVE_ESI, %esi
155: leal (%ecx,%edx), %eax C eax = carry bit + carry limb, the value held at ret
156:
157: movl SAVE_EDI, %edi
158:
159: movl SAVE_EBX, %ebx
160: addl $STACK_SPACE, %esp
161:
162: ret
163:
164:
165: ifdef(`PIC',`
166: L(movl_eip_edi): C helper for the PIC path: copies its own return address to edi
167: movl (%esp), %edi
168: ret
169: ')
170:
171: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>