Annotation of OpenXM_contrib/gmp/mpn/x86/pentium4/sse2/sqr_basecase.asm, Revision 1.1.1.1
1.1 ohara 1: dnl Intel Pentium-4 mpn_sqr_basecase -- square an mpn number.
2:
3: dnl Copyright 2001, 2002 Free Software Foundation, Inc.
4: dnl
5: dnl This file is part of the GNU MP Library.
6: dnl
7: dnl The GNU MP Library is free software; you can redistribute it and/or
8: dnl modify it under the terms of the GNU Lesser General Public License as
9: dnl published by the Free Software Foundation; either version 2.1 of the
10: dnl License, or (at your option) any later version.
11: dnl
12: dnl The GNU MP Library is distributed in the hope that it will be useful,
13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15: dnl Lesser General Public License for more details.
16: dnl
17: dnl You should have received a copy of the GNU Lesser General Public
18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20: dnl Suite 330, Boston, MA 02111-1307, USA.
21:
22: include(`../config.m4')
23:
24:
25: C P4: approx 3.5 cycles per crossproduct, or 7 cycles per triangular
26: C product, at around 30x30 limbs.
27:
28:
29: C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
30: C
31: C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
32: C lot of function call overheads are avoided, especially when the size is
33: C small.
34: C
35: C On small sizes there's only a small speedup over mpn_mul_basecase;
36: C presumably branch mispredictions are a bigger fraction of the work done.
37: C It's not clear how to help this.
38:
39: defframe(PARAM_SIZE,12)
40: defframe(PARAM_SRC, 8)
41: defframe(PARAM_DST, 4)
42: C The three parameters above are read into edx, eax and ecx on entry.
43: TEXT
44: ALIGN(8)
45: PROLOGUE(mpn_sqr_basecase)
46: deflit(`FRAME',0)
47:
48: movl PARAM_SIZE, %edx
49: movl PARAM_SRC, %eax
50: movl PARAM_DST, %ecx
51:
52: cmpl $2, %edx
53: C Unsigned dispatch on size: ==2 and >2 branch away, anything else falls through.
54: je L(two_limbs)
55: ja L(three_or_more)
56:
57: C -----------------------------------------------------------------------------
58: C one limb only (reached whenever size is below 2)
59: C eax src
60: C ebx
61: C ecx dst
62: C edx
63:
64: movl (%eax), %eax C src[0]
65: mull %eax C edx:eax = src[0]^2
66:
67: movl %eax, (%ecx) C dst[0]
68: movl %edx, 4(%ecx) C dst[1]
69:
70: ret
71:
72: C -----------------------------------------------------------------------------
73: L(two_limbs):
74: C eax src
75: C ebx
76: C ecx dst
77: C edx size
78: C dst[0..3] = src[0]^2 + 2*src[0]*src[1]*2^32 + src[1]^2*2^64, one limb at a time.
79: movd (%eax), %mm1 C src[0]
80: movd 4(%eax), %mm0 C src[1]
81: pmuludq %mm1, %mm0 C src[0]*src[1]
82:
83: pmuludq %mm1, %mm1 C src[0]^2
84:
85: movd 4(%eax), %mm2 C src[1]
86: pmuludq %mm2, %mm2 C src[1]^2
87:
88: movd %mm1, (%ecx) C dst[0]
89: psrlq $32, %mm1 C high(src[0]^2)
90:
91: pcmpeqd %mm3, %mm3
92: psrlq $32, %mm3 C 0x00000000FFFFFFFF
93: pand %mm0, %mm3 C low(src[0]*src[1])
94: psrlq $32, %mm0 C high(src[0]*src[1])
95:
96: psllq $1, %mm3 C 2*low(src[0]*src[1])
97: paddq %mm3, %mm1 C high(src[0]^2)
98: movd %mm1, 4(%ecx) C dst[1]
99:
100: pcmpeqd %mm4, %mm4
101: psrlq $32, %mm4 C 0x00000000FFFFFFFF
102: pand %mm2, %mm4 C low(src[1]^2)
103: psrlq $32, %mm2 C high(src[1]^2)
104:
105: psllq $1, %mm0 C 2*high(src[0]*src[1])
106: psrlq $32, %mm1 C carry
107: paddq %mm1, %mm0 C add the carry out of dst[1]
108: paddq %mm4, %mm0 C low(src[1]^2)
109: movd %mm0, 8(%ecx) C dst[2]
110:
111: psrlq $32, %mm0 C carry
112: paddq %mm2, %mm0 C high(src[1]^2)
113: movd %mm0, 12(%ecx) C dst[3]
114: C Check that nothing is left above dst[3].
115: ASSERT(z,`
116: psrlq $32, %mm0
117: movd %mm0, %eax
118: orl %eax, %eax')
119:
120: emms C leave MMX state before returning
121: ret
122:
123:
124: C -----------------------------------------------------------------------------
125: L(three_or_more):
126:
127: C eax src
128: C ebx
129: C ecx dst
130: C edx size
131: C esi
132: C edi
133: C ebp
134: C
135: C First multiply src[0]*src[1..size-1] and store at dst[1..size].
136: C esi, edi and ebp are saved here and restored at the end; ebx is never touched.
137: defframe(SAVE_ESI, -4)
138: defframe(SAVE_EDI, -8)
139: defframe(SAVE_EBP, -12)
140: deflit(STACK_SPACE, 12)
141:
142: subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE)
143: pxor %mm0, %mm0 C initial carry
144: movd (%eax), %mm7 C multiplier src[0]
145:
146: movl %esi, SAVE_ESI
147: movl %edi, SAVE_EDI
148: movl %ebp, SAVE_EBP
149:
150:
151: movl %eax, %esi C esi = src
152: movl %ecx, %edi C edi = dst
153: subl $1, %edx C size-1 iterations of mul1
154:
155: C First multiply src[0]*src[1..size-1] and store at dst[1..size].
156: L(mul1):
157: C eax src, incrementing
158: C ebx
159: C ecx dst, incrementing
160: C edx counter, size-1 iterations
161: C esi src
162: C edi dst
163: C ebp
164: C
165: C mm0 carry limb
166: C mm7 multiplier
167:
168: movd 4(%eax), %mm1 C next src limb
169: addl $4, %eax
170: pmuludq %mm7, %mm1 C src[0] * src limb
171: paddq %mm1, %mm0 C plus carry in
172: movd %mm0, 4(%ecx) C store low limb
173: addl $4, %ecx
174: psrlq $32, %mm0 C high limb becomes the next carry
175: subl $1, %edx
176: jnz L(mul1)
177: C mm0 is now the limb destined for dst[size]; the outer loop stores it,
178: C or the corner code adds it in when size is 3.
179: movl PARAM_SIZE, %ebp
180: subl $3, %ebp C ebp = size-3 outer passes
181: jz L(corner) C no outer pass needed when size is 3
182:
183:
184: C Add products src[n]*src[n+1..size-1] at dst[2*n+1...], for
185: C n=1..size-3. The final product src[size-2]*src[size-1], which is the
186: C end corner of the product triangle, is handled separately to save
187: C looping overhead.
188:
189: L(outer):
190: C eax
191: C ebx
192: C ecx
193: C edx
194: C esi src, incrementing
195: C edi dst, incrementing
196: C ebp outer passes left (starts at size-3), decrementing
197: C
198: C mm0 prev carry
199:
200: movd 4(%esi), %mm7 C multiplier
201: movd %mm0, 4(%ecx) C prev carry
202:
203: leal 8(%esi), %eax C next src
204: addl $4, %esi
205:
206: leal 8(%edi), %ecx C next dst
207: addl $8, %edi
208:
209: leal 1(%ebp), %edx C counter, ebp+1 inner iterations
210:
211: pxor %mm0, %mm0 C initial carry limb (carries live in mm0, not in the flags)
212:
213: L(inner):
214: C eax src, incrementing
215: C ebx
216: C ecx dst, incrementing
217: C edx counter
218: C esi outer src
219: C edi outer dst
220: C ebp outer size
221: C
222: C mm0 carry
223:
224: movd (%eax), %mm1 C src limb
225: leal 4(%eax), %eax
226: movd 4(%ecx),%mm2 C dst limb
227: pmuludq %mm7, %mm1 C multiplier * src limb
228: paddq %mm2, %mm1 C plus dst limb
229: paddq %mm1, %mm0 C plus carry
230: subl $1, %edx
231: movd %mm0, 4(%ecx) C store low limb
232: psrlq $32, %mm0 C new carry
233: leal 4(%ecx), %ecx C leal keeps the flags from subl intact for jnz
234: jnz L(inner)
235:
236: subl $1, %ebp
237: jnz L(outer)
238:
239:
240: L(corner):
241: C esi &src[size-3]
242: C edi &dst[2*size-6]
243: C mm0 carry
244: C
245: C +-----+-----+--
246: C | mm0 | dst so far
247: C +-----+-----+--
248: C +-----+-----+
249: C | | | src[size-2]*src[size-1]
250: C +-----+-----+
251:
252: movd 4(%esi), %mm1 C src[size-2]
253: movd 8(%esi), %mm2 C src[size-1]
254: pmuludq %mm2, %mm1 C src[size-1]*src[size-2]
255:
256: movl PARAM_SRC, %eax C eax = src, walked by the diagonal loop
257: movd (%eax), %mm2 C src[0]
258: pmuludq %mm2, %mm2 C src[0]^2
259:
260: pcmpeqd %mm7, %mm7
261: psrlq $32, %mm7 C 0x00000000FFFFFFFF
262:
263: movl PARAM_DST, %edx C edx = dst, walked by the diagonal loop
264: movd 4(%edx), %mm3 C dst[1]
265:
266: paddq %mm1, %mm0 C carry + corner product
267: movd %mm0, 12(%edi) C dst[2*size-3]
268:
269: psrlq $32, %mm0
270: movd %mm0, 16(%edi) C dst[2*size-2]
271:
272: movd %mm2, (%edx) C dst[0]
273: psrlq $32, %mm2 C high(src[0]^2)
274:
275: psllq $1, %mm3 C 2*dst[1]
276: paddq %mm3, %mm2
277: movd %mm2, 4(%edx) C dst[1]
278: psrlq $32, %mm2 C carry
279:
280: movl PARAM_SIZE, %ecx
281: subl $2, %ecx C size-2 iterations
282:
283: C Now form squares on the diagonal src[0]^2,...,src[size-1]^2, and
284: C add to the triangular parts dst[1..2*size-2] with those left
285: C shifted by 1 bit.
286:
287: L(diag):
288: C eax src, incrementing
289: C ebx
290: C ecx counter, size-2 iterations
291: C edx dst, incrementing
292: C esi
293: C edi
294: C ebp
295: C
296: C mm2 carry
297: C mm7 0x00000000FFFFFFFF
298:
299: movd 4(%eax), %mm0 C src limb
300: addl $4, %eax
301: pmuludq %mm0, %mm0 C src limb squared
302: movq %mm7, %mm1
303: pand %mm0, %mm1 C diagonal low
304: psrlq $32, %mm0 C diagonal high
305:
306: movd 8(%edx), %mm3
307: psllq $1, %mm3 C 2*dst[i]
308: paddq %mm3, %mm1
309: paddq %mm1, %mm2 C plus carry
310: movd %mm2, 8(%edx)
311: psrlq $32, %mm2 C carry
312:
313: movd 12(%edx), %mm3
314: psllq $1, %mm3 C 2*dst[i+1]
315: paddq %mm3, %mm0
316: paddq %mm0, %mm2 C plus carry
317: movd %mm2, 12(%edx)
318: addl $8, %edx
319: psrlq $32, %mm2 C carry
320:
321: subl $1, %ecx
322: jnz L(diag)
323:
324: C Last diagonal term, src[size-1]^2; the mask in mm7 is consumed here.
325: movd 4(%eax), %mm0 C src[size-1]
326: pmuludq %mm0, %mm0
327: pand %mm0, %mm7 C diagonal low
328: psrlq $32, %mm0 C diagonal high
329:
330: movd 8(%edx), %mm3 C dst[2*size-2]
331: psllq $1, %mm3
332: paddq %mm3, %mm7
333: paddq %mm7, %mm2
334: movd %mm2, 8(%edx)
335: psrlq $32, %mm2
336:
337: paddq %mm0, %mm2
338: movd %mm2, 12(%edx) C dst[2*size-1]
339:
340: ASSERT(z,` C no further carry
341: psrlq $32, %mm2
342: movd %mm2, %eax
343: orl %eax, %eax')
344:
345: C Restore the registers saved on entry to the three-or-more path and drop the frame.
346: movl SAVE_ESI, %esi
347: movl SAVE_EDI, %edi
348: movl SAVE_EBP, %ebp
349: addl $STACK_SPACE, %esp
350: emms C leave MMX state before returning
351: ret
352:
353: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>