Annotation of OpenXM_contrib/gmp/mpn/x86/k7/mmx/popham.asm, Revision 1.1
1.1 ! maekawa 1: dnl AMD K7 mpn_popcount, mpn_hamdist -- population count and hamming
! 2: dnl distance.
! 3: dnl
! 4: dnl K7: popcount 5.0 cycles/limb, hamdist 6.0 cycles/limb
! 5:
! 6:
! 7: dnl Copyright (C) 2000 Free Software Foundation, Inc.
! 8: dnl
! 9: dnl This file is part of the GNU MP Library.
! 10: dnl
! 11: dnl The GNU MP Library is free software; you can redistribute it and/or
! 12: dnl modify it under the terms of the GNU Lesser General Public License as
! 13: dnl published by the Free Software Foundation; either version 2.1 of the
! 14: dnl License, or (at your option) any later version.
! 15: dnl
! 16: dnl The GNU MP Library is distributed in the hope that it will be useful,
! 17: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
! 18: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
! 19: dnl Lesser General Public License for more details.
! 20: dnl
! 21: dnl You should have received a copy of the GNU Lesser General Public
! 22: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
! 23: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
! 24: dnl Suite 330, Boston, MA 02111-1307, USA.
! 25:
! 26:
! 27: include(`../config.m4')
! 28:
! 29:
! 30: dnl Only recent versions of gas know psadbw, in particular gas 2.9.1 on
! 31: dnl FreeBSD 3.3 and 3.4 doesn't recognise it.
! 32: dnl psadbw_mm4_mm0 emits psadbw %mm4,%mm0 as opcode bytes for athlon or pentium3 targets. Otherwise it emits an emulation that ignores the value of mm4 and leaves only the low 8 bits of the sum of the bytes of mm0 in mm0.
! 33: define(psadbw_mm4_mm0,
! 34: `ifelse(m4_ifdef_anyof_p(`HAVE_TARGET_CPU_athlon',
! 35: `HAVE_TARGET_CPU_pentium3'),1,
! 36: `.byte 0x0f,0xf6,0xc4 C psadbw %mm4, %mm0',
! 37:
! 38: `m4_warning(`warning, using simulated and only partly functional psadbw, use for testing only
! 39: ') C this works enough for the sum of bytes done below, making it
! 40: C possible to test on an older cpu
! 41: leal -8(%esp), %esp C make room for 8 bytes - lea does not alter the flags and the caller branches on them after this macro
! 42: movq %mm4, (%esp) C save mm4 because it is used as scratch below
! 43: movq %mm0, %mm4 C mm4 = working copy that gets shifted down
! 44: forloop(i,1,7,
! 45: ` psrlq $ 8, %mm4 C bring the next higher byte down to the low byte
! 46: paddb %mm4, %mm0 C low byte of mm0 accumulates the byte sum
! 47: ')
! 48: pushl $ 0 C high dword of the mask
! 49: pushl $ 0xFF C low dword - the qword at the top of stack is now 0xFF
! 50: pand (%esp), %mm0 C keep only the low byte of mm0
! 51: movq 8(%esp), %mm4 C restore the mm4 saved above
! 52: leal 16(%esp), %esp C release the mask and the save slot - again without altering the flags
! 53: ')')
! 54:
! 55:
! 56: C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
! 57: C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
! 58: C
! 59: C The code here is almost certainly not optimal, but is already a 3x speedup
! 60: C over the generic C code. The main improvement would be to interleave
! 61: C processing of two qwords in the loop so as to fully exploit the available
! 62: C execution units, possibly leading to 3.25 c/l (13 cycles for 4 limbs).
! 63: C
! 64: C The loop is based on the example "Efficient 64-bit population count using
! 65: C MMX instructions" in the Athlon Optimization Guide, AMD document 22007,
! 66: C page 158 of rev E (reference in mpn/x86/k7/README).
! 67: dnl Call m4_error unless OPERATION_popcount or OPERATION_hamdist is defined.
! 68: ifdef(`OPERATION_popcount',,
! 69: `ifdef(`OPERATION_hamdist',,
! 70: `m4_error(`Need OPERATION_popcount or OPERATION_hamdist defined
! 71: ')')')
! 72: dnl Usage: HAM(text) - expands to text when OPERATION_hamdist is defined and to nothing otherwise.
! 73: define(HAM,
! 74: m4_assert_numargs(1)
! 75: `ifdef(`OPERATION_hamdist',`$1')')
! 76: dnl Usage: POP(text) - expands to text when OPERATION_popcount is defined and to nothing otherwise.
! 77: define(POP,
! 78: m4_assert_numargs(1)
! 79: `ifdef(`OPERATION_popcount',`$1')')
! 80: dnl Per-operation setup: defframe names for the arguments and M4_function for the function name. NOTE(review): the offsets 4 8 12 follow the argument order of the prototypes above - presumably 4-byte stack slots above the return address. Confirm against defframe.
! 81: HAM(`
! 82: defframe(PARAM_SIZE, 12)
! 83: defframe(PARAM_SRC2, 8)
! 84: defframe(PARAM_SRC, 4)
! 85: define(M4_function,mpn_hamdist)
! 86: ')
! 87: POP(`
! 88: defframe(PARAM_SIZE, 8)
! 89: defframe(PARAM_SRC, 4)
! 90: define(M4_function,mpn_popcount)
! 91: ')
! 92: dnl NOTE(review): MULFUNC_PROLOGUE is defined outside this file. It is given both function names that this source can produce - confirm its role in the build macros.
! 93: MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
! 94:
! 95:
! 96: ifdef(`PIC',,`
! 97: dnl non-PIC - the three 64-bit masks are emitted into .rodata here and loaded with movq by the function below. The PIC build constructs them in registers instead.
! 98:
! 99: .section .rodata
! 100: ALIGN(8)
! 101: C LS(name) hands the current M4_function and the given name to LF - presumably producing a label private to this function. Confirm against LF.
! 102: define(LS,
! 103: m4_assert_numargs(1)
! 104: `LF(M4_function,`$1')')
! 105:
! 106: LS(rodata_AAAAAAAAAAAAAAAA): C mask selecting the high bit of every 2-bit field
! 107: .long 0xAAAAAAAA C low dword
! 108: .long 0xAAAAAAAA C high dword - same pattern so the qword is uniform
! 109:
! 110: LS(rodata_3333333333333333): C mask selecting the low 2 bits of every nibble
! 111: .long 0x33333333
! 112: .long 0x33333333
! 113:
! 114: LS(rodata_0F0F0F0F0F0F0F0F): C mask selecting the low nibble of every byte
! 115: .long 0x0F0F0F0F
! 116: .long 0x0F0F0F0F
! 117: ')
! 118:
! 119: .text
! 120: ALIGN(32)
! 121: C Entry point named by M4_function. Returns in eax the number of set bits in src - or in src xor src2 for hamdist - over size limbs.
! 122: PROLOGUE(M4_function)
! 123: deflit(`FRAME',0)
! 124:
! 125: movl PARAM_SIZE, %ecx C ecx = size in limbs
! 126: orl %ecx, %ecx C sets ZF when size is 0 and leaves ecx unchanged
! 127: jz L(zero) C size 0 returns 0 before any MMX register is touched
! 128:
! 129: ifdef(`PIC',`
! 130: movl $0xAAAAAAAA, %eax C PIC build: make the masks from immediates rather than loading them from memory
! 131: movl $0x33333333, %edx
! 132:
! 133: movd %eax, %mm7
! 134: movd %edx, %mm6
! 135:
! 136: movl $0x0F0F0F0F, %eax
! 137:
! 138: punpckldq %mm7, %mm7 C copy the low dword into the high dword: mm7 = 0xAAAAAAAAAAAAAAAA
! 139: punpckldq %mm6, %mm6 C mm6 = 0x3333333333333333
! 140:
! 141: movd %eax, %mm5
! 142: movd %edx, %mm4 C NOTE(review): looks redundant - mm4 is cleared by the pxor after this ifdef before it is read
! 143:
! 144: punpckldq %mm5, %mm5 C mm5 = 0x0F0F0F0F0F0F0F0F
! 145:
! 146: ',`
! 147: movq LS(rodata_AAAAAAAAAAAAAAAA), %mm7 C non-PIC build: load the masks from .rodata
! 148: movq LS(rodata_3333333333333333), %mm6
! 149: movq LS(rodata_0F0F0F0F0F0F0F0F), %mm5
! 150: ')
! 151: pxor %mm4, %mm4 C mm4 = 0 as the second operand of psadbw_mm4_mm0
! 152:
! 153: define(REG_AAAAAAAAAAAAAAAA,%mm7)
! 154: define(REG_3333333333333333,%mm6)
! 155: define(REG_0F0F0F0F0F0F0F0F,%mm5)
! 156: define(REG_0000000000000000,%mm4)
! 157:
! 158:
! 159: movl PARAM_SRC, %eax C eax = src
! 160: HAM(` movl PARAM_SRC2, %edx') C edx = src2 in the hamdist build only
! 161:
! 162: pxor %mm2, %mm2 C total
! 163:
! 164: shrl %ecx C ecx = size/2 qwords and CF = low bit of size
! 165: jnc L(top) C even size: every limb is handled by the qword loop
! 166:
! 167: movd (%eax,%ecx,8), %mm1 C odd size: load the single limb above the qwords - movd clears the high half of mm1
! 168:
! 169: HAM(` movd 0(%edx,%ecx,8), %mm0
! 170: pxor %mm0, %mm1 C hamdist: combine with the matching src2 limb
! 171: ')
! 172: orl %ecx, %ecx C ZF = no qwords left - this is what the jnz at the loop end will test
! 173: jmp L(loaded) C join the loop body after its load and decrement
! 174:
! 175:
! 176: ALIGN(16)
! 177: L(top):
! 178: C eax src
! 179: C ebx
! 180: C ecx counter, qwords, decrementing
! 181: C edx [hamdist] src2
! 182: C
! 183: C mm0 (scratch)
! 184: C mm1 (scratch)
! 185: C mm2 total (low dword)
! 186: C mm3
! 187: C mm4 \
! 188: C mm5 | special constants
! 189: C mm6 |
! 190: C mm7 /
! 191:
! 192: movq -8(%eax,%ecx,8), %mm1 C qword number ecx-1 so the data is walked from high to low addresses
! 193:
! 194: HAM(` pxor -8(%edx,%ecx,8), %mm1') C hamdist: xor in the matching src2 qword
! 195: decl %ecx C ZF from here is consumed by the jnz at the loop end so nothing in between may change the flags
! 196:
! 197: L(loaded):
! 198: movq %mm1, %mm0
! 199: pand REG_AAAAAAAAAAAAAAAA, %mm1 C isolate the high bit of every 2-bit field
! 200:
! 201: psrlq $1, %mm1
! 202:
! 203: psubd %mm1, %mm0 C bit pairs - each 2-bit field now holds its own bit count
! 204:
! 205:
! 206: movq %mm0, %mm1
! 207: psrlq $2, %mm0
! 208:
! 209: pand REG_3333333333333333, %mm0 C counts from the upper 2-bit field of each nibble
! 210: pand REG_3333333333333333, %mm1 C counts from the lower 2-bit field of each nibble
! 211:
! 212: paddd %mm1, %mm0 C nibbles - each 4-bit field now holds its own bit count
! 213:
! 214:
! 215: movq %mm0, %mm1
! 216: psrlq $4, %mm0
! 217:
! 218: pand REG_0F0F0F0F0F0F0F0F, %mm0 C counts from the high nibble of each byte
! 219: pand REG_0F0F0F0F0F0F0F0F, %mm1 C counts from the low nibble of each byte
! 220:
! 221: paddd %mm1, %mm0 C bytes - each byte now holds its own bit count
! 222:
! 223:
! 224: psadbw_mm4_mm0
! 225:
! 226: paddd %mm0, %mm2 C add to total - mm0 is the sum of the eight byte counts left by psadbw_mm4_mm0
! 227: jnz L(top) C loop while qwords remain - ZF is still the one set by decl or orl above
! 228:
! 229:
! 230: movd %mm2, %eax C return value = low dword of the total
! 231: emms C empty the MMX state before returning
! 232: ret
! 233:
! 234:
! 235: L(zero):
! 236: movl $0, %eax C size 0: the count is 0
! 237: ret
! 238:
! 239: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>