Annotation of OpenXM_contrib/gmp/mpn/x86/k6/mmx/popham.asm, Revision 1.1.1.2
1.1 maekawa 1: dnl AMD K6-2 mpn_popcount, mpn_hamdist -- mpn bit population count and
2: dnl hamming distance.
3: dnl
4: dnl          popcount  hamdist
5: dnl K6-2:    9.0       11.5     cycles/limb
6: dnl K6:      12.5      13.0
7:
8:
9: dnl Copyright (C) 2000 Free Software Foundation, Inc.
10: dnl
11: dnl This file is part of the GNU MP Library.
12: dnl
13: dnl The GNU MP Library is free software; you can redistribute it and/or
14: dnl modify it under the terms of the GNU Lesser General Public License as
15: dnl published by the Free Software Foundation; either version 2.1 of the
16: dnl License, or (at your option) any later version.
17: dnl
18: dnl The GNU MP Library is distributed in the hope that it will be useful,
19: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
20: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21: dnl Lesser General Public License for more details.
22: dnl
23: dnl You should have received a copy of the GNU Lesser General Public
24: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
25: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
26: dnl Suite 330, Boston, MA 02111-1307, USA.
27:
28:
29: include(`../config.m4')
30:
31:
32: C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
33: C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
34: C
35: C The code here isn't optimal, but it's already a 2x speedup over the plain
36: C integer mpn/generic/popcount.c and mpn/generic/hamdist.c.
37:
38: C Stop the m4 run with an error unless OPERATION_popcount or OPERATION_hamdist is defined.
39: ifdef(`OPERATION_popcount',,
40: `ifdef(`OPERATION_hamdist',,
41: `m4_error(`Need OPERATION_popcount or OPERATION_hamdist
42: ')m4exit(1)')')
43: C HAM(text) expands to text when OPERATION_hamdist is defined and to nothing otherwise.
44: define(HAM,
45: m4_assert_numargs(1)
46: `ifdef(`OPERATION_hamdist',`$1')')
47: C POP(text) expands to text when OPERATION_popcount is defined and to nothing otherwise.
48: define(POP,
49: m4_assert_numargs(1)
50: `ifdef(`OPERATION_popcount',`$1')')
51: C Parameter frame entries and the M4_function name for whichever operation is being built.
52: HAM(`
53: defframe(PARAM_SIZE, 12)
54: defframe(PARAM_SRC2, 8)
55: defframe(PARAM_SRC, 4)
56: define(M4_function,mpn_hamdist)
57: ')
58: POP(`
59: defframe(PARAM_SIZE, 8)
60: defframe(PARAM_SRC, 4)
61: define(M4_function,mpn_popcount)
62: ')
63: C NOTE(review): the next line presumably declares which entry points this file can build - confirm.
64: MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
65:
66:
67: ifdef(`PIC',,`
68: dnl non-PIC: the function reads its four 64-bit masks from memory and they are defined here
69:
1.1.1.2 ! maekawa 70: DATA
1.1 maekawa 71: ALIGN(8)
72: C LS(name) hands M4_function and name to LF to form the label for a constant.
73: define(LS,
74: m4_assert_numargs(1)
75: `LF(M4_function,`$1')')
76: C Mask: high bit of every 2-bit pair.
77: LS(rodata_AAAAAAAAAAAAAAAA):
78: .long 0xAAAAAAAA
79: .long 0xAAAAAAAA
80: C Mask: low two bits of every nibble.
81: LS(rodata_3333333333333333):
82: .long 0x33333333
83: .long 0x33333333
84: C Mask: low nibble of every byte.
85: LS(rodata_0F0F0F0F0F0F0F0F):
86: .long 0x0F0F0F0F
87: .long 0x0F0F0F0F
88: C Mask: low byte of each dword.
89: LS(rodata_000000FF000000FF):
90: .long 0x000000FF
91: .long 0x000000FF
92: ')
93: C Bit count of src over size limbs for popcount or of src xor src2 for hamdist; the total is returned in eax.
94: .text
95: ALIGN(32)
96:
97: POP(`ifdef(`PIC', `
98: C avoid shrl crossing a 32-byte boundary
99: nop')')
100:
101: PROLOGUE(M4_function)
102: deflit(`FRAME',0)
103: C A zero size skips all MMX work and returns 0 through L(zero).
104: movl PARAM_SIZE, %ecx
105: orl %ecx, %ecx
106: jz L(zero)
107: C Put the four masks in mm7..mm4: built from immediates under PIC, loaded from memory otherwise.
108: ifdef(`PIC',`
109: movl $0xAAAAAAAA, %eax
110: movl $0x33333333, %edx
111:
112: movd %eax, %mm7
113: movd %edx, %mm6
114:
115: movl $0x0F0F0F0F, %eax
116: movl $0x000000FF, %edx
117: C punpckldq of a register with itself copies its low dword into its high dword
118: punpckldq %mm7, %mm7
119: punpckldq %mm6, %mm6
120:
121: movd %eax, %mm5
122: movd %edx, %mm4
123:
124: punpckldq %mm5, %mm5
125: punpckldq %mm4, %mm4
126: ',`
127:
128: movq LS(rodata_AAAAAAAAAAAAAAAA), %mm7
129: movq LS(rodata_3333333333333333), %mm6
130: movq LS(rodata_0F0F0F0F0F0F0F0F), %mm5
131: movq LS(rodata_000000FF000000FF), %mm4
132: ')
133: C m4 names for the registers that hold the masks, used by the loop below.
134: define(REG_AAAAAAAAAAAAAAAA, %mm7)
135: define(REG_3333333333333333, %mm6)
136: define(REG_0F0F0F0F0F0F0F0F, %mm5)
137: define(REG_000000FF000000FF, %mm4)
138:
139:
140: movl PARAM_SRC, %eax
141: HAM(` movl PARAM_SRC2, %edx')
142:
143: pxor %mm2, %mm2 C total
144: C ecx becomes size/2, the number of whole qwords; carry set means one odd limb is left over.
145: shrl %ecx
146: jnc L(top)
147: C Odd size: the final limb, at limb index 2*ecx, is loaded alone; movd zeroes the upper half of the register.
148: Zdisp( movd, 0,(%eax,%ecx,8), %mm1)
149:
150: HAM(`
151: Zdisp( movd, 0,(%edx,%ecx,8), %mm0)
152: pxor %mm0, %mm1
153: ')
154: C Count the odd limb as one extra pass so the loop instruction still visits every whole qword.
155: incl %ecx
156: jmp L(loaded)
157:
158:
159: ALIGN(16)
160: POP(` nop C alignment to avoid crossing 32-byte boundaries')
161:
162: L(top):
163: C eax src
164: C ebx
165: C ecx counter, qwords, decrementing
166: C edx [hamdist] src2
167: C
168: C mm0 (scratch)
169: C mm1 (scratch)
170: C mm2 total (low dword)
171: C mm3
172: C mm4 \
173: C mm5 | special constants
174: C mm6 |
175: C mm7 /
176: C Fetch qword number ecx-1, so the operands are walked from their end towards their start.
177: movq -8(%eax,%ecx,8), %mm1
178: HAM(` pxor -8(%edx,%ecx,8), %mm1')
179:
180: L(loaded):
181: movq %mm1, %mm0
182: pand REG_AAAAAAAAAAAAAAAA, %mm1
183: C x - ((x & 0xAA..) >> 1) leaves a 2-bit count in each bit pair.
184: psrlq $1, %mm1
185: HAM(` nop C code alignment')
186:
187: psubd %mm1, %mm0 C bit pairs
188: HAM(` nop C code alignment')
189:
190: C Add neighbouring pair counts: one sum per nibble, at most 4.
191: movq %mm0, %mm1
192: psrlq $2, %mm0
193:
194: pand REG_3333333333333333, %mm0
195: pand REG_3333333333333333, %mm1
196:
197: paddd %mm1, %mm0 C nibbles
198:
199: C Add neighbouring nibble counts: one sum per byte, at most 8.
200: movq %mm0, %mm1
201: psrlq $4, %mm0
202:
203: pand REG_0F0F0F0F0F0F0F0F, %mm0
204: pand REG_0F0F0F0F0F0F0F0F, %mm1
205:
206: paddd %mm1, %mm0 C bytes
207: C Add each byte to the byte above it; the low byte of every word then holds the count for that word.
208: movq %mm0, %mm1
209: psrlq $8, %mm0
210:
211:
212: paddb %mm1, %mm0 C words
213:
214: C Add the two word counts of each dword; only the low byte is meaningful, so the rest is masked off.
215: movq %mm0, %mm1
216: psrlq $16, %mm0
217:
218: paddd %mm1, %mm0 C dwords
219:
220: pand REG_000000FF000000FF, %mm0
221: C Accumulate both dword counts into the low dword of mm2.
222: paddd %mm0, %mm2 C low to total
223: psrlq $32, %mm0
224:
225: paddd %mm0, %mm2 C high to total
226: loop L(top)
227:
228:
229: C Return the low dword of the total. NOTE(review): emms_or_femms presumably leaves MMX state before returning - confirm.
230: movd %mm2, %eax
231: emms_or_femms
232: ret
233: C size == 0: nothing to count.
234: L(zero):
235: movl $0, %eax
236: ret
237:
238: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>