Annotation of OpenXM_contrib/gmp/mpn/x86/k7/mmx/popham.asm, Revision 1.1.1.3
1.1 maekawa 1: dnl AMD K7 mpn_popcount, mpn_hamdist -- population count and hamming
2: dnl distance.
3:
1.1.1.3 ! ohara 4: dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
1.1 maekawa 5: dnl
6: dnl This file is part of the GNU MP Library.
7: dnl
8: dnl The GNU MP Library is free software; you can redistribute it and/or
9: dnl modify it under the terms of the GNU Lesser General Public License as
10: dnl published by the Free Software Foundation; either version 2.1 of the
11: dnl License, or (at your option) any later version.
12: dnl
13: dnl The GNU MP Library is distributed in the hope that it will be useful,
14: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
15: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16: dnl Lesser General Public License for more details.
17: dnl
18: dnl You should have received a copy of the GNU Lesser General Public
19: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
20: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
21: dnl Suite 330, Boston, MA 02111-1307, USA.
22:
23: include(`../config.m4')
24:
25:
1.1.1.3 ! ohara 26: C K7: popcount 5.0 cycles/limb, hamdist 6.0 cycles/limb
1.1 maekawa 27:
28:
29: C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
30: C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
31: C
32: C The code here is almost certainly not optimal, but is already a 3x speedup
33: C over the generic C code. The main improvement would be to interleave
34: C processing of two qwords in the loop so as to fully exploit the available
35: C execution units, possibly leading to 3.25 c/l (13 cycles for 4 limbs).
36: C
37: C The loop is based on the example "Efficient 64-bit population count using
38: C MMX instructions" in the Athlon Optimization Guide, AMD document 22007,
39: C page 158 of rev E (reference in mpn/x86/k7/README).
40: dnl Stop the m4 run with an error unless OPERATION_popcount or OPERATION_hamdist is defined; these symbols select which of the two functions this file builds.
41: ifdef(`OPERATION_popcount',,
42: `ifdef(`OPERATION_hamdist',,
43: `m4_error(`Need OPERATION_popcount or OPERATION_hamdist defined
44: ')')')
45: dnl HAM(text) expands to text only when OPERATION_hamdist is defined, otherwise to nothing.
46: define(HAM,
47: m4_assert_numargs(1)
48: `ifdef(`OPERATION_hamdist',`$1')')
49: dnl POP(text) is the counterpart of HAM for OPERATION_popcount.
50: define(POP,
51: m4_assert_numargs(1)
52: `ifdef(`OPERATION_popcount',`$1')')
53: dnl Parameter slots and function name for the selected operation; the offsets follow the argument order of the prototypes above. NOTE(review): defframe is defined outside this file -- presumably the numbers are byte offsets into the caller-pushed argument area; confirm.
54: HAM(`
55: defframe(PARAM_SIZE, 12)
56: defframe(PARAM_SRC2, 8)
57: defframe(PARAM_SRC, 4)
58: define(M4_function,mpn_hamdist)
59: ')
60: POP(`
61: defframe(PARAM_SIZE, 8)
62: defframe(PARAM_SRC, 4)
63: define(M4_function,mpn_popcount)
64: ')
65: dnl NOTE(review): MULFUNC_PROLOGUE is defined outside this file; presumably it tells the build which entry points this one source can provide -- confirm.
66: MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
67:
68: dnl Non-PIC builds keep the three 64-bit masks (two .long each, 8-byte aligned) in read-only data; PIC builds skip this block and build the masks from immediates inside the function.
69: ifdef(`PIC',,`
70: dnl non-PIC
71:
1.1.1.3 ! ohara 72: RODATA
1.1 maekawa 73: ALIGN(8)
74:
1.1.1.3 ! ohara 75: L(rodata_AAAAAAAAAAAAAAAA):
1.1 maekawa 76: .long 0xAAAAAAAA
77: .long 0xAAAAAAAA
78:
1.1.1.3 ! ohara 79: L(rodata_3333333333333333):
1.1 maekawa 80: .long 0x33333333
81: .long 0x33333333
82:
1.1.1.3 ! ohara 83: L(rodata_0F0F0F0F0F0F0F0F):
1.1 maekawa 84: .long 0x0F0F0F0F
85: .long 0x0F0F0F0F
86: ')
87:
1.1.1.3 ! ohara 88: TEXT
1.1 maekawa 89: ALIGN(32)
90: C M4_function (mpn_popcount or mpn_hamdist): returns in eax the number of 1 bits in {src,size}, or for hamdist the number of 1 bits in {src,size} XOR {src2,size}.
91: PROLOGUE(M4_function)
92: deflit(`FRAME',0)
93: C NOTE(review): size == 0 is not handled: shrl leaves ecx = 0 with carry clear, so L(top) would read -8(%eax) and decl would wrap ecx to -1; presumably callers guarantee size >= 1 -- confirm.
94: movl PARAM_SIZE, %ecx C ecx = size in limbs
95: C Load the masks: mm7 = 0xAAAA..., mm6 = 0x3333..., mm5 = 0x0F0F... (built from immediates under PIC, read from RODATA otherwise).
96: ifdef(`PIC',`
97: movl $0xAAAAAAAA, %eax
98: movl $0x33333333, %edx
99:
100: movd %eax, %mm7
101: movd %edx, %mm6
102:
103: movl $0x0F0F0F0F, %eax
104:
105: punpckldq %mm7, %mm7
106: punpckldq %mm6, %mm6
107:
108: movd %eax, %mm5
109: movd %edx, %mm4
110:
111: punpckldq %mm5, %mm5
112:
113: ',`
1.1.1.3 ! ohara 114: movq L(rodata_AAAAAAAAAAAAAAAA), %mm7
! 115: movq L(rodata_3333333333333333), %mm6
! 116: movq L(rodata_0F0F0F0F0F0F0F0F), %mm5
1.1 maekawa 117: ')
118: pxor %mm4, %mm4 C mm4 = 0, the zero operand for psadbw below (this overwrites the value the PIC path put in mm4)
119: C Symbolic names for the constant registers used by the loop.
120: define(REG_AAAAAAAAAAAAAAAA,%mm7)
121: define(REG_3333333333333333,%mm6)
122: define(REG_0F0F0F0F0F0F0F0F,%mm5)
123: define(REG_0000000000000000,%mm4)
124:
125:
126: movl PARAM_SRC, %eax C eax = src
127: HAM(` movl PARAM_SRC2, %edx') C [hamdist] edx = src2
128:
129: pxor %mm2, %mm2 C running total = 0
130: C Halve size to get a qword count; the carry flag receives the low bit, i.e. whether there is an odd limb.
131: shrl %ecx
132: jnc L(top) C even size: go straight to the qword loop
133: C Odd size: handle the single limb at byte offset 8*ecx first (movd zero-extends it to 64 bits), then join the loop body.
134: movd (%eax,%ecx,8), %mm1
135:
1.1.1.3 ! ohara 136: HAM(` movd (%edx,%ecx,8), %mm0
1.1 maekawa 137: pxor %mm0, %mm1
138: ')
139: orl %ecx, %ecx C set ZF for the jnz at the loop bottom (ZF = 1 when no qwords remain)
140: jmp L(loaded)
141:
142:
143: ALIGN(16)
144: L(top):
145: C eax src
146: C ebx
147: C ecx counter, qwords, decrementing
148: C edx [hamdist] src2
149: C
150: C mm0 (scratch)
151: C mm1 (scratch)
152: C mm2 total (low dword)
153: C mm3
154: C mm4 \
155: C mm5 | special constants
156: C mm6 |
157: C mm7 /
158:
159: movq -8(%eax,%ecx,8), %mm1 C load qword number ecx-1 (counting from 0)
160:
161: HAM(` pxor -8(%edx,%ecx,8), %mm1') C [hamdist] mm1 = src qword XOR src2 qword
162: decl %ecx C sets ZF for the jnz at the loop bottom; the MMX instructions in between do not touch the flags
163:
164: L(loaded):
165: movq %mm1, %mm0
166: pand REG_AAAAAAAAAAAAAAAA, %mm1 C keep the high bit of every bit pair
167:
168: psrlq $1, %mm1 C ... and move it to the low position of its pair
169:
170: psubd %mm1, %mm0 C bit pairs: each 2-bit field now holds the count of its own set bits
171:
172:
173: movq %mm0, %mm1
174: psrlq $2, %mm0
175:
176: pand REG_3333333333333333, %mm0
177: pand REG_3333333333333333, %mm1
178:
179: paddd %mm1, %mm0 C nibbles: each 4-bit field holds its count (0..4)
180:
181:
182: movq %mm0, %mm1
183: psrlq $4, %mm0
184:
185: pand REG_0F0F0F0F0F0F0F0F, %mm0
186: pand REG_0F0F0F0F0F0F0F0F, %mm1
187:
188: paddd %mm1, %mm0 C bytes: each byte holds its count (0..8)
189:
190:
1.1.1.3 ! ohara 191: psadbw( %mm4, %mm0) C sum of absolute differences against zero: adds the 8 byte counts into the low word of mm0
1.1 maekawa 192:
193: paddd %mm0, %mm2 C add to total
194: jnz L(top) C flags come from decl (or from orl on the odd-limb entry)
195:
196:
197: movd %mm2, %eax C return value = low dword of the total
198: emms C clear the MMX state before returning
199: ret
200:
201: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>