Annotation of OpenXM_contrib/gmp/mpn/x86/k7/mode1o.asm, Revision 1.1.1.1
1.1 ohara 1: dnl AMD K7 mpn_modexact_1_odd -- exact division style remainder.
2:
3: dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
4: dnl
5: dnl This file is part of the GNU MP Library.
6: dnl
7: dnl The GNU MP Library is free software; you can redistribute it and/or
8: dnl modify it under the terms of the GNU Lesser General Public License as
9: dnl published by the Free Software Foundation; either version 2.1 of the
10: dnl License, or (at your option) any later version.
11: dnl
12: dnl The GNU MP Library is distributed in the hope that it will be useful,
13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15: dnl Lesser General Public License for more details.
16: dnl
17: dnl You should have received a copy of the GNU Lesser General Public
18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20: dnl Suite 330, Boston, MA 02111-1307, USA.
21:
22: include(`../config.m4')
23:
24:
25: C K7: 11.0 cycles/limb
26:
27:
28: C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
29: C mp_limb_t divisor);
30: C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
31: C mp_limb_t divisor, mp_limb_t carry);
32: C
33: C With the loop running at just 11 cycles it doesn't seem worth bothering to
34: C check for high<divisor to save one step.
35: C
36: C Using a divl for size==1 measures slower than the modexact method, which
37: C is not too surprising since for the latter it's only about 24 cycles to
38: C calculate the modular inverse.
39:
40: defframe(PARAM_CARRY, 16) C 4th argument: carry, read only by mpn_modexact_1c_odd
41: defframe(PARAM_DIVISOR,12) C 3rd argument: divisor
42: defframe(PARAM_SIZE, 8) C 2nd argument: size, the limb count
43: defframe(PARAM_SRC, 4) C 1st argument: src, pointer to the limbs
44: C Save slots, 4 bytes apart, for the four registers restored before returning.
45: defframe(SAVE_EBX, -4)
46: defframe(SAVE_ESI, -8)
47: defframe(SAVE_EDI, -12)
48: defframe(SAVE_EBP, -16)
49: C Byte count subtracted from esp at L(start_1c); matches the four save slots.
50: deflit(STACK_SPACE, 16)
51:
52: TEXT
53:
54: ALIGN(16)
55: PROLOGUE(mpn_modexact_1c_odd)
56: deflit(`FRAME',0)
57: C Carry-in entry point: load the carry argument, then join the shared code.
58: movl PARAM_CARRY, %ecx C ecx = initial carry supplied by the caller
59: jmp L(start_1c) C continue in mpn_modexact_1_odd, past its carry clear
60: C PIC builds only: helper that copies its own return address into edi, so
61: C the caller learns the address of the instruction following its call.
62: ifdef(`PIC',`
63: L(movl_eip_edi):
64: movl (%esp), %edi C edi = return address on top of the stack
65: ret
66: ')
67:
68: EPILOGUE()
69:
70:
71: ALIGN(16)
72: PROLOGUE(mpn_modexact_1_odd)
73: deflit(`FRAME',0)
74: C No-carry entry point: zero the initial carry and fall into the shared code.
75: xorl %ecx, %ecx C ecx = 0, no initial carry
76: L(start_1c):
77: movl PARAM_DIVISOR, %eax C eax = d, used to form the table index
78: subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE)
79: C esi, edi, ebp and ebx are stored in the save slots and reloaded before ret.
80: movl %esi, SAVE_ESI
81: movl PARAM_DIVISOR, %esi C esi = d, kept for the whole routine
82:
83: movl %edi, SAVE_EDI
84: C Table index is (d>>1) & 127, that is bits 1 to 7 of the divisor.
85: shrl %eax C d/2
86:
87: andl $127, %eax C keep 7 bits as the table index
88: C Fetch the 8-bit starting inverse from modlimb_invert_table into edi.
89: ifdef(`PIC',`
90: call L(movl_eip_edi) C edi = address of the next instruction
91: addl $_GLOBAL_OFFSET_TABLE_, %edi C edi = address of the GOT
92: movl modlimb_invert_table@GOT(%edi), %edi C edi = table address read from the GOT
93: movzbl (%eax,%edi), %edi C inv 8 bits
94:
95: ',`
96: dnl non-PIC
97: movzbl modlimb_invert_table(%eax), %edi C inv 8 bits
98: ')
99: C Two steps of inv = 2*inv - inv*inv*d follow, each doubling the valid low bits: 8 -> 16 -> 32.
100: xorl %edx, %edx C initial extra carry
101: leal (%edi,%edi), %eax C 2*inv
102:
103: imull %edi, %edi C inv*inv
104: C Register saves and argument loads are interleaved here, presumably to overlap the imull latency.
105: movl %ebp, SAVE_EBP
106: movl PARAM_SIZE, %ebp C ebp = size in limbs
107:
108: movl %ebx, SAVE_EBX
109: movl PARAM_SRC, %ebx C ebx = src
110:
111: imull %esi, %edi C inv*inv*d
112:
113: subl %edi, %eax C inv = 2*inv - inv*inv*d
114: leal (%eax,%eax), %edi C 2*inv
115:
116: imull %eax, %eax C inv*inv
117:
118: imull %esi, %eax C inv*inv*d
119:
120: leal (%ebx,%ebp,4), %ebx C src end
121: negl %ebp C -size
122: C Now ebx + 4*ebp addresses the first limb and ebp counts up from -size to zero.
123: subl %eax, %edi C inv = 2*inv - inv*inv*d
124:
125: ASSERT(e,` C d*inv == 1 mod 2^BITS_PER_MP_LIMB
126: movl %esi, %eax
127: imull %edi, %eax
128: cmpl $1, %eax')
129: C Main loop, per limb: s = limb - carry bit - carry limb, then q = s*inv, and
130: C the high half of q*d becomes the next carry limb.
131: C The dependent chain here is
132: C
133: C cycles
134: C subl %edx, %eax 1
135: C imull %edi, %eax 4
136: C mull %esi 6 (high limb)
137: C ----
138: C total 11
139: C
140: C Out of order execution hides the load latency for the source data, so no
141: C special scheduling is required.
142:
143: L(top):
144: C eax src limb
145: C ebx src end ptr
146: C ecx next carry bit, 0 or 1 (or initial carry param)
147: C edx carry limb, high of last product
148: C esi divisor
149: C edi inverse
150: C ebp counter, limbs, negative
151:
152: movl (%ebx,%ebp,4), %eax C eax = current src limb
153:
154: subl %ecx, %eax C apply carry bit
155: movl $0, %ecx C zero ecx with movl, which leaves CF intact for the setc below
156:
157: setc %cl C new carry bit
158:
159: subl %edx, %eax C apply carry limb
160: adcl $0, %ecx C add the borrow from the carry limb subtract
161:
162: imull %edi, %eax C eax = q, low 32 bits of s * inverse
163:
164: mull %esi C edx:eax = q * d, edx is the next carry limb
165:
166: incl %ebp C advance the negative counter towards zero
167: jnz L(top) C repeat until every limb has been processed
168: C Loop done: eax gets carry bit + carry limb as the return value, while
169: C the saved registers are reloaded and the stack space is released.
170: movl SAVE_ESI, %esi
171: movl SAVE_EDI, %edi
172: leal (%ecx,%edx), %eax C eax = carry bit + carry limb
173:
174: movl SAVE_EBX, %ebx
175: movl SAVE_EBP, %ebp
176: addl $STACK_SPACE, %esp C release the register save area
177:
178: ret
179:
180: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>