Annotation of OpenXM_contrib/gmp/mpn/x86/pentium4/mmx/popham.asm, Revision 1.1.1.1
1.1 ohara 1: dnl Intel Pentium 4 mpn_popcount, mpn_hamdist -- population count and
2: dnl hamming distance.
3:
4: dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
5: dnl
6: dnl This file is part of the GNU MP Library.
7: dnl
8: dnl The GNU MP Library is free software; you can redistribute it and/or
9: dnl modify it under the terms of the GNU Lesser General Public License as
10: dnl published by the Free Software Foundation; either version 2.1 of the
11: dnl License, or (at your option) any later version.
12: dnl
13: dnl The GNU MP Library is distributed in the hope that it will be useful,
14: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
15: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16: dnl Lesser General Public License for more details.
17: dnl
18: dnl You should have received a copy of the GNU Lesser General Public
19: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
20: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
21: dnl Suite 330, Boston, MA 02111-1307, USA.
22:
23: include(`../config.m4')
24:
25:
26: C P4: popcount 8.5 cycles/limb
27: C hamdist 9.5 cycles/limb
28:
29:
30: C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
31: C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
32: C
33: C Loading with unaligned movq's costs an extra 1 c/l and hence is avoided.
34: C Two movd's and a punpckldq seems to be the same speed as an aligned movq,
35: C and using them saves fiddling about with alignment testing on entry.
36: C
37: C For popcount there's 13 mmx instructions in the loop, so perhaps 6.5 c/l
38: C might be possible, but 8.5 c/l relying on out-of-order execution is
39: C already quite reasonable.
40: C Build-time check: m4_error is invoked with the message below when neither OPERATION_popcount nor OPERATION_hamdist is defined.
41: ifdef(`OPERATION_popcount',,
42: `ifdef(`OPERATION_hamdist',,
43: `m4_error(`Need OPERATION_popcount or OPERATION_hamdist defined
44: ')')')
45: C HAM(text) expands to its single argument when OPERATION_hamdist is defined and to nothing otherwise.
46: define(HAM,
47: m4_assert_numargs(1)
48: `ifdef(`OPERATION_hamdist',`$1')')
49: C POP(text) expands to its single argument when OPERATION_popcount is defined and to nothing otherwise.
50: define(POP,
51: m4_assert_numargs(1)
52: `ifdef(`OPERATION_popcount',`$1')')
53: C Per-variant parameter names and the function name handed to PROLOGUE; the offsets 4, 8, 12 follow the argument order of the C prototypes above.
54: HAM(`
55: defframe(PARAM_SIZE, 12) C third argument of mpn_hamdist: size
56: defframe(PARAM_SRC2, 8) C second argument of mpn_hamdist: src2
57: defframe(PARAM_SRC, 4) C first argument of mpn_hamdist: src
58: define(M4_function,mpn_hamdist)
59: ')
60: POP(`
61: defframe(PARAM_SIZE, 8) C second argument of mpn_popcount: size
62: defframe(PARAM_SRC, 4) C first argument of mpn_popcount: src
63: define(M4_function,mpn_popcount)
64: ')
65: C MULFUNC_PROLOGUE names both functions this one source can produce; presumably consumed by the build machinery outside this file -- TODO confirm.
66: MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
67:
68: C Non-PIC builds only: the three 64-bit masks live in read-only data and are fetched with movq in the function body; PIC builds construct them in registers instead.
69: ifdef(`PIC',,`
70: dnl non-PIC
71: RODATA
72: ALIGN(8)
73: L(rodata_AAAAAAAAAAAAAAAA): C loaded into mm7: the high bit of every bit pair
74: .long 0xAAAAAAAA
75: .long 0xAAAAAAAA
76: L(rodata_3333333333333333): C loaded into mm6: the low two bits of every nibble
77: .long 0x33333333
78: .long 0x33333333
79: L(rodata_0F0F0F0F0F0F0F0F): C loaded into mm5: the low nibble of every byte
80: .long 0x0F0F0F0F
81: .long 0x0F0F0F0F
82: ')
83: C Function body: load the arguments, set up the mask registers, then count two limbs per loop pass plus at most one single-limb pass through L(last).
84: TEXT
85: ALIGN(16)
86:
87: PROLOGUE(M4_function)
88: deflit(`FRAME',0)
89:
90: movl PARAM_SIZE, %ecx C ecx = size in limbs
91: movl PARAM_SRC, %eax C eax = src
92: C PIC branch builds each mask from an immediate via edx and copies the low dword into the high dword; presumably this avoids a position-dependent data reference -- TODO confirm.
93: ifdef(`PIC',`
94: movl $0xAAAAAAAA, %edx
95: movd %edx, %mm7
96: punpckldq %mm7, %mm7 C mm7 = 0xAAAAAAAAAAAAAAAA
97:
98: movl $0x33333333, %edx
99: movd %edx, %mm6
100: punpckldq %mm6, %mm6 C mm6 = 0x3333333333333333
101:
102: movl $0x0F0F0F0F, %edx
103: movd %edx, %mm5
104: punpckldq %mm5, %mm5 C mm5 = 0x0F0F0F0F0F0F0F0F
105:
106: HAM(` movl PARAM_SRC2, %edx') C hamdist: edx = src2, loaded only after edx is finished as mask scratch
107:
108: ',`
109: dnl non-PIC
110: HAM(` movl PARAM_SRC2, %edx') C hamdist: edx = src2
111: movq L(rodata_AAAAAAAAAAAAAAAA), %mm7
112: movq L(rodata_3333333333333333), %mm6
113: movq L(rodata_0F0F0F0F0F0F0F0F), %mm5
114: ')
115: C mm4 stays zero as the psadbw operand; mm0 accumulates the running count.
116: pxor %mm4, %mm4 C zero
117: pxor %mm0, %mm0 C total
118: C ecx becomes size-1: a nonzero result without borrow enters the two-limb loop, a zero result falls into the single-limb path.
119: subl $1, %ecx
120: ja L(top)
121: C NOTE(review): with size 0 the subl borrows, ja is not taken, and L(last) reads the limb just below src -- presumably callers always pass size >= 1, confirm.
122: L(last):
123: movd (%eax,%ecx,4), %mm1 C src high limb
124: HAM(` movd (%edx,%ecx,4), %mm2
125: pxor %mm2, %mm1 C differing bits of the two high limbs
126: ')
127: jmp L(loaded) C count this single limb with the shared code; movd left the high dword of mm1 zero
128:
129:
130: L(top):
131: C eax src
132: C ebx
133: C ecx counter, size-1 to 2 or 1, inclusive
134: C edx [hamdist] src2
135: C
136: C mm0 total (low dword)
137: C mm1 (scratch)
138: C mm2 (scratch)
139: C mm3
140: C mm4 0x0000000000000000
141: C mm5 0x0F0F0F0F0F0F0F0F
142: C mm6 0x3333333333333333
143: C mm7 0xAAAAAAAAAAAAAAAA
144: C Two movd loads joined by punpckldq stand in for one movq, so no alignment test is needed on entry.
145: movd (%eax), %mm1
146: movd 4(%eax), %mm2
147: punpckldq %mm2, %mm1 C mm1 = two src limbs, first limb in the low dword
148: addl $8, %eax C advance src by two limbs
149:
150: HAM(` movd (%edx), %mm2
151: movd 4(%edx), %mm3
152: punpckldq %mm3, %mm2
153: pxor %mm2, %mm1 C mm1 = src xor src2 for these two limbs
154: addl $8, %edx
155: ')
156: C Parallel bit count of mm1: per bit pair, then per nibble, then per byte, then a byte sum added into mm0.
157: L(loaded):
158: movq %mm7, %mm2
159: pand %mm1, %mm2
160: psrlq $1, %mm2 C mm2 = high bit of each pair moved down to the low position
161: psubd %mm2, %mm1 C bit pairs
162:
163: movq %mm6, %mm2
164: pand %mm1, %mm2 C mm2 = low pair count of each nibble
165: psrlq $2, %mm1
166: pand %mm6, %mm1 C mm1 = high pair count of each nibble
167: paddd %mm2, %mm1 C nibbles
168:
169: movq %mm5, %mm2
170: pand %mm1, %mm2 C mm2 = low nibble count of each byte
171: psrlq $4, %mm1
172: pand %mm5, %mm1 C mm1 = high nibble count of each byte
173: paddd %mm2, %mm1 C bytes
174:
175: psadbw( %mm4, %mm1) C sum of the eight byte counts, taken as absolute differences against zero mm4
176: paddd %mm1, %mm0 C to total
177:
178: subl $2, %ecx C remaining limbs are now ecx+1
179: jg L(top) C ecx > 0 means at least two limbs remain
180:
181: C ecx is 0 or -1 representing respectively 1 or 0 further limbs
182: jz L(last)
183:
184: C After the single-limb pass ecx is negative, so neither branch above is taken and control arrives here.
185: movd %mm0, %eax C return value = low dword of the total
186: emms C reset the MMX state before returning to the caller
187: ret
188:
189: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>