Annotation of OpenXM_contrib/gmp/mpn/x86/k7/aors_n.asm, Revision 1.1
1.1 ! maekawa 1: dnl AMD K7 mpn_add_n/mpn_sub_n -- mpn add or subtract.
! 2: dnl
! 3: dnl K7: 1.64 cycles/limb (at 16 limb/loop).
! 4:
! 5:
! 6: dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
! 7: dnl
! 8: dnl This file is part of the GNU MP Library.
! 9: dnl
! 10: dnl The GNU MP Library is free software; you can redistribute it and/or
! 11: dnl modify it under the terms of the GNU Lesser General Public License as
! 12: dnl published by the Free Software Foundation; either version 2.1 of the
! 13: dnl License, or (at your option) any later version.
! 14: dnl
! 15: dnl The GNU MP Library is distributed in the hope that it will be useful,
! 16: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
! 17: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
! 18: dnl Lesser General Public License for more details.
! 19: dnl
! 20: dnl You should have received a copy of the GNU Lesser General Public
! 21: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
! 22: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
! 23: dnl Suite 330, Boston, MA 02111-1307, USA.
! 24:
! 25:
! 26: include(`../config.m4')
! 27:
! 28:
! 29: dnl K7: UNROLL_COUNT cycles/limb
! 30: dnl 8 1.9
! 31: dnl 16 1.64
! 32: dnl 32 1.7
! 33: dnl 64 2.0
! 34: dnl Maximum possible with the current code is 64.
! 35: dnl 16 is used because it is the fastest of the settings measured above.
! 36: deflit(UNROLL_COUNT, 16)
! 37:
! 38: dnl Pick the carry instruction (adcl or sbbl) and the two function names from whichever OPERATION_ symbol is defined.
! 39: ifdef(`OPERATION_add_n', `
! 40: define(M4_inst, adcl)
! 41: define(M4_function_n, mpn_add_n)
! 42: define(M4_function_nc, mpn_add_nc)
! 43: define(M4_description, add)
! 44: ',`ifdef(`OPERATION_sub_n', `
! 45: define(M4_inst, sbbl)
! 46: define(M4_function_n, mpn_sub_n)
! 47: define(M4_function_nc, mpn_sub_nc)
! 48: define(M4_description, subtract)
! 49: ',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
! 50: ')')')
! 51: dnl MULFUNC_PROLOGUE is defined outside this file; it is given the four entry point names this source can produce.
! 52: MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
! 53:
! 54:
! 55: C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
! 56: C mp_size_t size);
! 57: C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
! 58: C mp_size_t size, mp_limb_t carry);
! 59: C
! 60: C Calculate src1,size M4_description src2,size, and store the result in
! 61: C dst,size. The return value is the carry bit from the top of the result (1
! 62: C or 0).
! 63: C
! 64: C The _nc version accepts 1 or 0 for an initial carry into the low limb of
! 65: C the calculation. Note values other than 1 or 0 here will lead to garbage
! 66: C results.
! 67: C
! 68: C This code runs at 1.64 cycles/limb, which is probably the best possible
! 69: C with plain integer operations. Each limb is 2 loads and 1 store, and in
! 70: C one cycle the K7 can do two loads, or a load and a store, leading to 1.5
! 71: C c/l.
! 72: dnl Sizes of UNROLL_THRESHOLD limbs or more take the unrolled loop, smaller sizes the simple loop. Both settings below are currently 8.
! 73: dnl Must have UNROLL_THRESHOLD >= 2, since the unrolled loop can't handle 1.
! 74: ifdef(`PIC',`
! 75: deflit(UNROLL_THRESHOLD, 8)
! 76: ',`
! 77: deflit(UNROLL_THRESHOLD, 8)
! 78: ')
! 79: dnl Argument slots, at positive offsets (presumably relative to the stack pointer at function entry, with FRAME tracking later changes -- confirm against defframe).
! 80: defframe(PARAM_CARRY,20)
! 81: defframe(PARAM_SIZE, 16)
! 82: defframe(PARAM_SRC2, 12)
! 83: defframe(PARAM_SRC1, 8)
! 84: defframe(PARAM_DST, 4)
! 85: dnl Register save slots, at negative offsets. STACK_SPACE is the total for the four 4-byte slots.
! 86: defframe(SAVE_EBP, -4)
! 87: defframe(SAVE_ESI, -8)
! 88: defframe(SAVE_EBX, -12)
! 89: defframe(SAVE_EDI, -16)
! 90: deflit(STACK_SPACE, 16)
! 91:
! 92: .text
! 93: ALIGN(32)
! 94: deflit(`FRAME',0)
! 95: C Carry-in entry point. Loads the carry argument and joins the common code at L(start).
! 96: PROLOGUE(M4_function_nc)
! 97: movl PARAM_CARRY, %eax C initial carry, expected to be 0 or 1
! 98: jmp LF(M4_function_n,start) C L(start) comes just after the carry clear in M4_function_n
! 99: EPILOGUE()
! 100:
! 101: PROLOGUE(M4_function_n)
! 102: C Plain entry point, no carry-in. The _nc entry jumps to L(start) with its carry already in eax.
! 103: xorl %eax, %eax C carry
! 104: L(start):
! 105: movl PARAM_SIZE, %ecx
! 106: subl $STACK_SPACE, %esp C make room for the register save slots
! 107: deflit(`FRAME',STACK_SPACE)
! 108:
! 109: movl %edi, SAVE_EDI
! 110: movl %ebx, SAVE_EBX
! 111: cmpl $UNROLL_THRESHOLD, %ecx C flags are consumed by the jae below, the movl between do not change them
! 112:
! 113: movl PARAM_SRC2, %edx
! 114: movl PARAM_SRC1, %ebx
! 115: jae L(unroll) C size >= UNROLL_THRESHOLD (unsigned)
! 116:
! 117: movl PARAM_DST, %edi
! 118: leal (%ebx,%ecx,4), %ebx C src1 end
! 119: leal (%edx,%ecx,4), %edx C src2 end
! 120:
! 121: leal (%edi,%ecx,4), %edi C dst end
! 122: negl %ecx C index counts from -size up to zero
! 123: shrl %eax C move the initial carry into the carry flag
! 124: C The counter is only tested after a limb has been processed, so size is assumed to be nonzero here.
! 125: C This loop is in a single 16 byte code block already, so no
! 126: C alignment necessary.
! 127: L(simple):
! 128: C eax scratch
! 129: C ebx src1
! 130: C ecx counter
! 131: C edx src2
! 132: C esi (not used on this path)
! 133: C edi dst
! 134: C ebp (not used on this path)
! 135:
! 136: movl (%ebx,%ecx,4), %eax
! 137: M4_inst (%edx,%ecx,4), %eax
! 138: movl %eax, (%edi,%ecx,4)
! 139: incl %ecx C incl leaves the carry flag alone for the next limb
! 140: jnz L(simple)
! 141:
! 142: movl $0, %eax C movl rather than xorl so the carry flag survives
! 143: movl SAVE_EDI, %edi
! 144:
! 145: movl SAVE_EBX, %ebx
! 146: setc %al C return value is the carry out of the top limb
! 147: addl $STACK_SPACE, %esp
! 148:
! 149: ret
! 150:
! 151:
! 152: C -----------------------------------------------------------------------------
! 153: C This is at 0x55, close enough to aligned.
! 154: L(unroll):
! 155: deflit(`FRAME',STACK_SPACE)
! 156: movl %ebp, SAVE_EBP
! 157: andl $-2, %ecx C size low bit masked out
! 158: andl $1, PARAM_SIZE C size low bit kept (overwrites the size argument slot in memory)
! 159:
! 160: movl %ecx, %edi C even part of size
! 161: decl %ecx
! 162: movl PARAM_DST, %ebp
! 163:
! 164: shrl $UNROLL_LOG2, %ecx C loop counter, the unrolled loop makes this many passes plus one
! 165: negl %edi
! 166: movl %esi, SAVE_ESI
! 167:
! 168: andl $UNROLL_MASK, %edi C number of limbs to skip in the first pass
! 169:
! 170: ifdef(`PIC',`
! 171: call L(pic_calc)
! 172: L(here):
! 173: ',`
! 174: leal L(entry) (%edi,%edi,8), %esi C 9 bytes per
! 175: ')
! 176: negl %edi C minus the number of skipped limbs
! 177: shrl %eax C initial carry into the carry flag, the leal and jmp below leave it alone
! 178: C Pull each pointer back by the skipped limbs. When UNROLL_BYTES is 256 the pointers are also biased by 128 bytes.
! 179: leal ifelse(UNROLL_BYTES,256,128) (%ebx,%edi,4), %ebx
! 180: leal ifelse(UNROLL_BYTES,256,128) (%edx,%edi,4), %edx
! 181: leal ifelse(UNROLL_BYTES,256,128) (%ebp,%edi,4), %edi
! 182:
! 183: jmp *%esi C enter the unrolled code part way through
! 184:
! 185: C PIC form of the entry address calculation. The return address on the stack is L(here).
! 186: ifdef(`PIC',`
! 187: L(pic_calc):
! 188: C See README.family about old gas bugs
! 189: leal (%edi,%edi,8), %esi
! 190: addl $L(entry)-L(here), %esi
! 191: addl (%esp), %esi
! 192: ret
! 193: ')
! 194:
! 195:
! 196: C -----------------------------------------------------------------------------
! 197: ALIGN(32)
! 198: L(top):
! 199: C eax zero
! 200: C ebx src1
! 201: C ecx counter
! 202: C edx src2
! 203: C esi scratch (was computed jump)
! 204: C edi dst
! 205: C ebp scratch
! 206: C src2 is stepped here at the top, so the first pass (entered at or after L(entry)) does not step it.
! 207: leal UNROLL_BYTES(%edx), %edx
! 208:
! 209: L(entry):
! 210: deflit(CHUNK_COUNT, 2)
! 211: forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
! 212: deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
! 213: deflit(`disp1', eval(disp0 + 4))
! 214:
! 215: Zdisp( movl, disp0,(%ebx), %esi)
! 216: movl disp1(%ebx), %ebp
! 217: Zdisp( M4_inst,disp0,(%edx), %esi)
! 218: Zdisp( movl, %esi, disp0,(%edi))
! 219: M4_inst disp1(%edx), %ebp
! 220: movl %ebp, disp1(%edi)
! 221: ')
! 222:
! 223: decl %ecx C decl and the leal below leave the carry flag alone
! 224: leal UNROLL_BYTES(%ebx), %ebx
! 225: leal UNROLL_BYTES(%edi), %edi
! 226: jns L(top)
! 227: C PARAM_SIZE now holds only the low bit of the original size, so it is 1 when one odd limb remains.
! 228: C None of the instructions from here to the setc change the carry flag, except M4_inst on the odd limb.
! 229: movl PARAM_SIZE, %esi
! 230: movl SAVE_EBP, %ebp
! 231: movl $0, %eax C movl rather than xorl so the carry flag survives
! 232:
! 233: decl %esi
! 234: js L(even)
! 235: C ebx and edi are a full step past the last pass and edx is one step behind. The ifelse removes the 128 byte bias used when UNROLL_BYTES is 256.
! 236: movl ifelse(UNROLL_BYTES,256,-128) (%ebx), %ecx
! 237: M4_inst eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128))(%edx), %ecx
! 238: movl %ecx, ifelse(UNROLL_BYTES,256,-128) (%edi)
! 239: L(even):
! 240:
! 241: movl SAVE_EDI, %edi
! 242: movl SAVE_EBX, %ebx
! 243: setc %al C return value is the carry out of the top limb
! 244:
! 245: movl SAVE_ESI, %esi
! 246: addl $STACK_SPACE, %esp
! 247:
! 248: ret
! 249:
! 250: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>