Annotation of OpenXM_contrib/gmp/mpn/x86/pentium4/sse2/mul_basecase.asm, Revision 1.1.1.1
1.1 ohara 1: dnl Intel Pentium-4 mpn_mul_basecase -- mpn by mpn multiplication.
2:
3: dnl Copyright 2001, 2002 Free Software Foundation, Inc.
4: dnl
5: dnl This file is part of the GNU MP Library.
6: dnl
7: dnl The GNU MP Library is free software; you can redistribute it and/or
8: dnl modify it under the terms of the GNU Lesser General Public License as
9: dnl published by the Free Software Foundation; either version 2.1 of the
10: dnl License, or (at your option) any later version.
11: dnl
12: dnl The GNU MP Library is distributed in the hope that it will be useful,
13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15: dnl Lesser General Public License for more details.
16: dnl
17: dnl You should have received a copy of the GNU Lesser General Public
18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20: dnl Suite 330, Boston, MA 02111-1307, USA.
21:
22: include(`../config.m4')
23:
24:
25: C P4: 6.0 cycles/crossproduct (approx)
26:
27:
28: C void mpn_mul_basecase (mp_ptr wp,
29: C mp_srcptr xp, mp_size_t xsize,
30: C mp_srcptr yp, mp_size_t ysize);
31: C
32: C Nothing special here, basically just mpn/generic/mul_basecase.c done with
33: C mpn_mul_1 and mpn_addmul_1 inline. As per mpn_addmul_1, the dependent
34: C chain in the inner loop is 4 c/l, but measures about 6.
35: C
36: C Enhancements:
37: C
38: C Perhaps some sort of vertical method would suit, though there'd be branch
39: C mispredictions on the end sections. But it's not clear how to get less
40: C than 4 instructions per crossproduct, and unless that can be done then a
41: C basic addmul_1 style may as well be used (assuming it can be brought up to
42: C its proper 4 c/l).
43:
44: defframe(PARAM_YSIZE, 20) C ysize, fifth argument in the prototype above
45: defframe(PARAM_YP, 16) C yp, fourth argument
46: defframe(PARAM_XSIZE, 12) C xsize, third argument
47: defframe(PARAM_XP, 8) C xp, second argument
48: defframe(PARAM_WP, 4) C wp, first argument
49: C The SAVE_ names below alias argument slots, so no extra stack space is used. Each slot must be read before the matching register is stored into it. PARAM_XSIZE has no alias because the code re-reads it on every outer pass. NOTE(review): defframe is defined outside this file, offsets are presumably relative to the stack pointer at entry - confirm.
50: define(SAVE_EBX,`PARAM_XP')
51: define(SAVE_ESI,`PARAM_YP')
52: define(SAVE_EDI,`PARAM_YSIZE')
53: define(SAVE_EBP,`PARAM_WP')
54:
55: TEXT
56: ALIGN(8)
57: PROLOGUE(mpn_mul_basecase)
58: deflit(`FRAME',0)
59: C Load the arguments. Each register that is restored at the done label is stored into an argument slot only after that slot has been read.
60: movl PARAM_XP, %eax C eax = xp
61: movl %ebx, SAVE_EBX C xp slot already read, reuse it for ebx
62: pxor %mm0, %mm0 C initial carry
63:
64: movl PARAM_YP, %edx C edx = yp
65: movl %esi, SAVE_ESI C yp slot already read, reuse it for esi
66:
67: movl PARAM_WP, %ebx C ebx = wp
68: movl %ebp, SAVE_EBP C wp slot already read, reuse it for ebp
69: movl %eax, %esi C xp
70:
71: movd (%edx), %mm7 C yp[0]
72:
73: movl PARAM_XSIZE, %ecx C ecx = xsize, counter for the first pass
74:
75: movl PARAM_YSIZE, %ebp C ebp = ysize
76: movl %edi, SAVE_EDI C ysize slot already read, reuse it for edi
77: movl %ebx, %edi C wp
78: C First pass: store xp times yp[0] into wp without reading wp. NOTE(review): the loop body runs before the count is tested, so xsize is presumably required to be at least 1 - confirm against the caller contract.
79: L(mul1):
80: C eax xp, incrementing
81: C ebx wp, incrementing
82: C ecx xsize, decrementing
83: C edx yp
84: C esi xp
85: C edi wp
86: C ebp ysize
87: C
88: C mm0 carry limb
89: C mm7 multiplier
90:
91: movd (%eax), %mm1 C mm1 = next xp limb, zero extended
92: addl $4, %eax C advance xp by one 4-byte limb
93: pmuludq %mm7, %mm1 C mm1 = 64-bit product of the xp limb and yp[0]
94: paddq %mm1, %mm0 C add the product to the carry
95: movd %mm0, (%ebx) C store the low 32 bits to wp
96: addl $4, %ebx C advance wp
97: psrlq $32, %mm0 C high 32 bits become the next carry
98: subl $1, %ecx
99: jnz L(mul1)
100:
101: movd %mm0, (%ebx) C final carry goes in the limb above the xsize limbs just written
102:
103: subl $1, %ebp C yp[0] is done. NOTE(review): a ysize of 0 would wrap here, presumably ysize is at least 1 - confirm.
104: jz L(done) C nothing more to do when ysize was 1
105:
106: C Each outer pass adds xp times the next yp limb into wp, one limb further along than the previous pass.
107: L(outer):
108: C eax scratch, reset to xp below
109: C ebx scratch, reset to the next wp below
110: C ecx scratch, reset to xsize below
111: C edx yp, incrementing
112: C esi xp
113: C edi wp, incrementing
114: C ebp ysize, decrementing
115:
116: movl %esi, %eax C xp
117:
118: leal 4(%edi), %ebx C next wp
119: addl $4, %edi C advance the outer wp to match
120:
121: movd 4(%edx), %mm7 C next yp limb
122: addl $4, %edx C advance yp to the limb just loaded
123:
124: pxor %mm0, %mm0 C initial carry
125:
126: movl PARAM_XSIZE, %ecx C reload xsize, its slot is never used as a save slot
127:
128:
129: L(inner):
130: C eax xp, incrementing
131: C ebx wp, incrementing
132: C ecx xsize, decrementing
133: C edx outer yp
134: C esi outer xp
135: C edi outer wp
136: C ebp outer ysize
137:
138: movd (%eax), %mm1 C mm1 = next xp limb
139: leal 4(%eax), %eax C advance xp
140: movd (%ebx),%mm2 C mm2 = existing wp limb
141: pmuludq %mm7, %mm1 C mm1 = 64-bit product of the xp limb and the current yp limb
142: paddq %mm2, %mm1 C add the existing wp limb
143: paddq %mm1, %mm0 C add to the carry, product plus two 32-bit values still fits in 64 bits
144: subl $1, %ecx C sets the flags tested by the jnz below
145: movd %mm0, (%ebx) C store the low 32 bits back to wp
146: psrlq $32, %mm0 C high 32 bits become the next carry
147: leal 4(%ebx), %ebx C advance wp, lea leaves the flags from subl intact
148: jnz L(inner)
149:
150: movd %mm0, (%ebx) C final carry goes in the limb above this row, stored not added
151:
152: subl $1, %ebp
153: jnz L(outer) C loop until every yp limb has been used
154:
155: C Restore the registers stashed in the argument slots at entry.
156: L(done):
157: movl SAVE_EBX, %ebx
158: movl SAVE_ESI, %esi
159: movl SAVE_EDI, %edi
160: movl SAVE_EBP, %ebp
161: emms C clear the MMX state before returning
162: ret
163:
164: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>