Annotation of OpenXM_contrib/gmp/mpn/ia64/popcount.asm, Revision 1.1.1.1
1.1 ohara 1: dnl IA-64 mpn_popcount.
2:
3: dnl Copyright 2000, 2001 Free Software Foundation, Inc.
4:
5: dnl This file is part of the GNU MP Library.
6:
7: dnl The GNU MP Library is free software; you can redistribute it and/or modify
8: dnl it under the terms of the GNU Lesser General Public License as published
9: dnl by the Free Software Foundation; either version 2.1 of the License, or (at
10: dnl your option) any later version.
11:
12: dnl The GNU MP Library is distributed in the hope that it will be useful, but
13: dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14: dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
15: dnl License for more details.
16:
17: dnl You should have received a copy of the GNU Lesser General Public License
18: dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
19: dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
20: dnl MA 02111-1307, USA.
21:
22: dnl Runs at 1 cycle/limb on the Itanium. That is the peak performance for the
23: dnl popcnt instruction, so this is optimal code. It should be straightforward
24: dnl to write mpn_hamdist with the same awesome performance.
25:
26: include(`../config.m4')
27:
28: C INPUT PARAMETERS
29: C sp = r32
30: C n = r33
31:
32: ASM_START()
C mpn_popcount (sp, n)
C
C Return in r8 the total number of one bits in the n 8-byte limbs that start
C at address sp.
C
C Register use:
C   r2       caller value of ar.lc, put back before every return
C   r8       running bit total and return value
C   r16-r19  limbs most recently loaded
C   r20-r23  popcnt results not yet added into r8
C            (r22 and r23 first hold n mod 4 and n / 4)
C   r32      sp, advanced by 8 after each load
C   r33      n
C
C All loops are counted with br.cloop, which branches and decrements ar.lc
C while ar.lc is nonzero and falls through with ar.lc still zero otherwise.
C ar.lc is therefore clobbered, and since it is a preserved register in the
C Itanium software conventions (and is announced to the unwinder with .save
C below) every exit path has to copy r2 back into it.

PROLOGUE(mpn_popcount)
	.prologue
	.save	ar.lc, r2
	mov	r2 = ar.lc
	.body
	and	r22 = 3, r33
	shr.u	r23 = r33, 2 ;;
	mov	ar.lc = r22
	mov	r8 = 0 ;;

C Phase 1: handle the first n mod 4 limbs one at a time.  The entry br.cloop
C skips the loop completely when n mod 4 is zero.
	br.cloop.dpnt	.Loop0 ;;
	br	.L0
.Loop0:	ld8	r16 = [r32], 8 ;;
	popcnt	r20 = r16 ;;
	add	r8 = r8, r20
	br.cloop.dptk	.Loop0 ;;

C Phase 2: handle the remaining n / 4 groups of four limbs with a software
C pipeline (load, popcnt, add).
.L0:	mov	ar.lc = r23 ;;
	br.cloop.dptk	.L1 ;;
C No complete group of four is left, r8 is already final.  Restore ar.lc
C before returning; the original code returned here with ar.lc clobbered.
	mov	ar.lc = r2
	br.ret.sptk.many b0 ;;

C Pipeline fill, stage 1: load the first group.
.L1:	ld8	r16 = [r32], 8 ;;
	ld8	r17 = [r32], 8 ;;
	ld8	r18 = [r32], 8 ;;
	ld8	r19 = [r32], 8 ;;
	br.cloop.dptk	.L2 ;;
	br	.Ldone1 ;;

C Pipeline fill, stage 2: popcnt the first group while loading the second.
.L2:
	popcnt	r20 = r16
	ld8	r16 = [r32], 8 ;;
	popcnt	r21 = r17
	ld8	r17 = [r32], 8 ;;
	popcnt	r22 = r18
	ld8	r18 = [r32], 8 ;;
	popcnt	r23 = r19
	ld8	r19 = [r32], 8 ;;
	br.cloop.dptk	.Loop ;;
	br	.Ldone0

C Steady state: in each instruction group add the count of a limb from the
C previous group, popcnt a limb of the current group, and load a limb of the
C next group.  Reading a register and overwriting it within one instruction
C group is intentional; the reader sees the old value.
.Loop:	add	r8 = r8, r20
	popcnt	r20 = r16
	ld8	r16 = [r32], 8 ;;
	add	r8 = r8, r21
	popcnt	r21 = r17
	ld8	r17 = [r32], 8 ;;
	add	r8 = r8, r22
	popcnt	r22 = r18
	ld8	r18 = [r32], 8 ;;
	add	r8 = r8, r23
	popcnt	r23 = r19
	ld8	r19 = [r32], 8
	br.cloop.dptk	.Loop ;;

C Pipeline drain after at least two groups: add the four pending counts,
C popcnt the last loaded group and fold those counts in as well.
.Ldone0:
	add	r8 = r8, r20
	popcnt	r20 = r16 ;;
	add	r8 = r8, r21
	popcnt	r21 = r17 ;;
	add	r8 = r8, r22
	popcnt	r22 = r18 ;;
	add	r8 = r8, r23
	popcnt	r23 = r19 ;;
	add	r21 = r21, r20
	add	r23 = r23, r22 ;;
	add	r8 = r8, r21 ;;
	add	r8 = r8, r23
C Restore ar.lc here too; the original code omitted it on this path.
	mov	ar.lc = r2
	br.ret.sptk.many b0

C Pipeline drain when exactly one group was loaded: nothing is pending, so
C just popcnt the four limbs and add them to r8.
.Ldone1:
	popcnt	r20 = r16
	popcnt	r21 = r17
	popcnt	r22 = r18
	popcnt	r23 = r19 ;;
	add	r21 = r21, r20
	add	r23 = r23, r22 ;;
	add	r8 = r8, r21 ;;
	add	r8 = r8, r23
	mov	ar.lc = r2
	br.ret.sptk.many b0
EPILOGUE(mpn_popcount)
111: ASM_END()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>