Annotation of OpenXM_contrib/gmp/mpn/x86/k7/aorsmul_1.asm, Revision 1.1
1.1 ! maekawa 1: dnl AMD K7 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
! 2: dnl
! 3: dnl K7: 3.9 cycles/limb.
! 4: dnl
! 5: dnl Future: It should be possible to avoid the separate mul after the
! 6: dnl unrolled loop by moving the movl/adcl to the top.
! 7:
! 8:
! 9: dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
! 10: dnl
! 11: dnl This file is part of the GNU MP Library.
! 12: dnl
! 13: dnl The GNU MP Library is free software; you can redistribute it and/or
! 14: dnl modify it under the terms of the GNU Lesser General Public License as
! 15: dnl published by the Free Software Foundation; either version 2.1 of the
! 16: dnl License, or (at your option) any later version.
! 17: dnl
! 18: dnl The GNU MP Library is distributed in the hope that it will be useful,
! 19: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
! 20: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
! 21: dnl Lesser General Public License for more details.
! 22: dnl
! 23: dnl You should have received a copy of the GNU Lesser General Public
! 24: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
! 25: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
! 26: dnl Suite 330, Boston, MA 02111-1307, USA.
! 27:
! 28:
! 29: include(`../config.m4')
! 30:
! 31:
! 32: dnl K7: UNROLL_COUNT cycles/limb
! 33: dnl 4 4.42
! 34: dnl 8 4.16
! 35: dnl 16 3.9
! 36: dnl 32 3.9
! 37: dnl 64 3.87
! 38: dnl Maximum possible with the current code is 64.
! 39: dnl UNROLL_COUNT is the number of limbs handled by one pass of the unrolled loop.
! 40: deflit(UNROLL_COUNT, 16)
! 41: dnl Select the add or subtract variant: M4_inst is the instruction applied to dst,
! 42: dnl the other M4_ macros give the function names and the wording used in comments.
! 43: ifdef(`OPERATION_addmul_1',`
! 44: define(M4_inst, addl)
! 45: define(M4_function_1, mpn_addmul_1)
! 46: define(M4_function_1c, mpn_addmul_1c)
! 47: define(M4_description, add it to)
! 48: define(M4_desc_retval, carry)
! 49: ',`ifdef(`OPERATION_submul_1',`
! 50: define(M4_inst, subl)
! 51: define(M4_function_1, mpn_submul_1)
! 52: define(M4_function_1c, mpn_submul_1c)
! 53: define(M4_description, subtract it from)
! 54: define(M4_desc_retval, borrow)
! 55: ',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
! 56: ')')')
! 57: dnl NOTE(review): MULFUNC_PROLOGUE is defined outside this file; it appears to list the entry points this source can provide -- confirm.
! 58: MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
! 59:
! 60:
! 61: C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
! 62: C mp_limb_t mult);
! 63: C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
! 64: C mp_limb_t mult, mp_limb_t carry);
! 65: C
! 66: C Calculate src,size multiplied by mult and M4_description dst,size.
! 67: C Return the M4_desc_retval limb from the top of the result.
! 68: dnl UNROLL_THRESHOLD: the unrolled loop is used when size-1 is above this value; PIC builds use the higher one.
! 69: ifdef(`PIC',`
! 70: deflit(UNROLL_THRESHOLD, 9)
! 71: ',`
! 72: deflit(UNROLL_THRESHOLD, 6)
! 73: ')
! 74: dnl Argument slots, in prototype order; PARAM_CARRY is only read by the 1c entry point.
! 75: defframe(PARAM_CARRY, 20)
! 76: defframe(PARAM_MULTIPLIER,16)
! 77: defframe(PARAM_SIZE, 12)
! 78: defframe(PARAM_SRC, 8)
! 79: defframe(PARAM_DST, 4)
! 80: deflit(`FRAME',0)
! 81: dnl Save slots for ebx, esi, edi and ebp inside the SAVE_SIZE bytes taken from the stack.
! 82: defframe(SAVE_EBX, -4)
! 83: defframe(SAVE_ESI, -8)
! 84: defframe(SAVE_EDI, -12)
! 85: defframe(SAVE_EBP, -16)
! 86: deflit(SAVE_SIZE, 16)
! 87: C Entry point without a carry-in argument: handles a single limb itself, otherwise continues in M4_function_1c with a zero carry.
! 88: .text
! 89: ALIGN(32)
! 90: PROLOGUE(M4_function_1)
! 91: movl PARAM_SIZE, %edx
! 92: movl PARAM_SRC, %eax
! 93: xorl %ecx, %ecx C initial carry is zero for this entry point
! 94:
! 95: decl %edx C edx = size-1
! 96: jnz LF(M4_function_1c,start_1) C size-1 nonzero: use the general code at start_1
! 97:
! 98: movl (%eax), %eax C single limb case: eax = src[0]
! 99: movl PARAM_DST, %ecx
! 100:
! 101: mull PARAM_MULTIPLIER C edx:eax = src[0] * mult
! 102:
! 103: M4_inst %eax, (%ecx) C apply the low product limb to dst[0]
! 104: adcl $0, %edx C fold the resulting carry or borrow into the high limb
! 105: movl %edx, %eax C return value
! 106:
! 107: ret
! 108: EPILOGUE()
! 109: C Entry point with a carry-in limb (PARAM_CARRY). The code from start_1 on is shared with M4_function_1.
! 110: ALIGN(16)
! 111: PROLOGUE(M4_function_1c)
! 112: movl PARAM_SIZE, %edx
! 113: movl PARAM_SRC, %eax
! 114:
! 115: decl %edx C edx = size-1
! 116: jnz L(more_than_one_limb)
! 117:
! 118: movl (%eax), %eax C single limb case: eax = src[0]
! 119: movl PARAM_DST, %ecx
! 120:
! 121: mull PARAM_MULTIPLIER C edx:eax = src[0] * mult
! 122:
! 123: addl PARAM_CARRY, %eax C add the carry-in to the low product limb
! 124:
! 125: adcl $0, %edx
! 126: M4_inst %eax, (%ecx) C apply the low limb to dst[0]
! 127:
! 128: adcl $0, %edx C carry or borrow from dst goes into the returned high limb
! 129: movl %edx, %eax
! 130:
! 131: ret
! 132:
! 133:
! 134: C offset 0x44 so close enough to aligned
! 135: L(more_than_one_limb):
! 136: movl PARAM_CARRY, %ecx
! 137: L(start_1):
! 138: C eax src
! 139: C ecx initial carry
! 140: C edx size-1
! 141: subl $SAVE_SIZE, %esp C room for the four saved registers
! 142: deflit(`FRAME',16)
! 143:
! 144: movl %ebx, SAVE_EBX
! 145: movl %esi, SAVE_ESI
! 146: movl %edx, %ebx C size-1
! 147:
! 148: movl PARAM_SRC, %esi
! 149: movl %ebp, SAVE_EBP
! 150: cmpl $UNROLL_THRESHOLD, %edx C flags are used by the ja below, the movl in between leave them alone
! 151:
! 152: movl PARAM_MULTIPLIER, %ebp
! 153: movl %edi, SAVE_EDI
! 154:
! 155: movl (%esi), %eax C src low limb
! 156: movl PARAM_DST, %edi
! 157: ja L(unroll) C size-1 above the threshold: take the unrolled loop
! 158:
! 159:
! 160: C simple loop
! 161:
! 162: leal 4(%esi,%ebx,4), %esi C point one limb past last
! 163: leal (%edi,%ebx,4), %edi C point at last limb
! 164: negl %ebx C ebx = -(size-1), counts up to zero
! 165:
! 166: C The movl to load the next source limb is done well ahead of the
! 167: C mul. This is necessary for full speed, and leads to one limb
! 168: C handled separately at the end.
! 169:
! 170: L(simple):
! 171: C eax src limb
! 172: C ebx loop counter
! 173: C ecx carry limb
! 174: C edx scratch
! 175: C esi src
! 176: C edi dst
! 177: C ebp multiplier
! 178:
! 179: mull %ebp C edx:eax = src limb * multiplier
! 180:
! 181: addl %eax, %ecx C low product limb plus incoming carry limb
! 182: adcl $0, %edx
! 183:
! 184: M4_inst %ecx, (%edi,%ebx,4)
! 185: movl (%esi,%ebx,4), %eax C load the next src limb early, see the note above the loop
! 186: adcl $0, %edx
! 187:
! 188: incl %ebx
! 189: movl %edx, %ecx C high limb becomes the next carry limb
! 190: jnz L(simple)
! 191:
! 192:
! 193: mull %ebp C last limb, its src value was loaded in the final loop pass
! 194:
! 195: movl SAVE_EBX, %ebx
! 196: movl SAVE_ESI, %esi
! 197: movl SAVE_EBP, %ebp
! 198:
! 199: addl %eax, %ecx
! 200: adcl $0, %edx
! 201:
! 202: M4_inst %ecx, (%edi)
! 203: adcl $0, %edx
! 204: movl SAVE_EDI, %edi
! 205:
! 206: addl $SAVE_SIZE, %esp
! 207: movl %edx, %eax C return the final carry or borrow limb
! 208: ret
! 209:
! 210:
! 211:
! 212: C -----------------------------------------------------------------------------
! 213: ALIGN(16)
! 214: L(unroll):
! 215: C eax src low limb
! 216: C ebx size-1
! 217: C ecx carry
! 218: C edx size-1
! 219: C esi src
! 220: C edi dst
! 221: C ebp multiplier
! 222:
! 223: dnl overlapping with parameters no longer needed
! 224: define(VAR_COUNTER,`PARAM_SIZE')
! 225: define(VAR_JUMP, `PARAM_MULTIPLIER')
! 226:
! 227: subl $2, %ebx C (size-2)-1
! 228: decl %edx C size-2
! 229:
! 230: shrl $UNROLL_LOG2, %ebx C ebx = ((size-2)-1) >> UNROLL_LOG2, stored below as the pass counter
! 231: negl %edx C -(size-2)
! 232:
! 233: movl %ebx, VAR_COUNTER
! 234: andl $UNROLL_MASK, %edx C edx = n = -(size-2) masked with UNROLL_MASK, the entry offset in limbs
! 235:
! 236: movl %edx, %ebx
! 237: shll $4, %edx C 16*n, the n in ebx is added below giving 17*n for 17 code bytes per limb
! 238: C edx becomes the address of the entry label plus 17*n, the computed jump target
! 239: ifdef(`PIC',`
! 240: call L(pic_calc)
! 241: L(here):
! 242: ',`
! 243: leal L(entry) (%edx,%ebx,1), %edx
! 244: ')
! 245: negl %ebx C ebx = -n
! 246: movl %edx, VAR_JUMP
! 247:
! 248: mull %ebp C first limb: edx:eax = src low limb * multiplier
! 249:
! 250: addl %eax, %ecx C initial carry, becomes low carry
! 251: adcl $0, %edx
! 252: testb $1, %bl C is n odd, the result is used by the cmovnz pair below
! 253:
! 254: movl 4(%esi), %eax C src second limb
! 255: leal ifelse(UNROLL_BYTES,256,128+) 8(%esi,%ebx,4), %esi C src+8 backed up by n limbs, biased by 128 when UNROLL_BYTES is 256
! 256: leal ifelse(UNROLL_BYTES,256,128) (%edi,%ebx,4), %edi C dst backed up by n limbs, same bias
! 257:
! 258: movl %edx, %ebx C high carry
! 259: cmovnz( %ecx, %ebx) C high,low carry other way around
! 260: cmovnz( %edx, %ecx)
! 261:
! 262: jmp *VAR_JUMP
! 263:
! 264: C PIC helper: forms the same entry plus 17*n address using the return address left on the stack by the call
! 265: ifdef(`PIC',`
! 266: L(pic_calc):
! 267: C See README.family about old gas bugs
! 268: leal (%edx,%ebx,1), %edx
! 269: addl $L(entry)-L(here), %edx
! 270: addl (%esp), %edx
! 271: ret
! 272: ')
! 273:
! 274:
! 275: C -----------------------------------------------------------------------------
! 276: C This code uses a "two carry limbs" scheme. At the top of the loop the
! 277: C carries are ecx=lo, ebx=hi, then they swap for each limb processed. For
! 278: C the computed jump an odd size means they start one way around, an even
! 279: C size the other. Either way one limb is handled separately at the start of
! 280: C the loop.
! 281: C
! 282: C The positioning of the movl to load the next source limb is important.
! 283: C Moving it after the adcl with a view to avoiding a separate mul at the end
! 284: C of the loop slows the code down.
! 285:
! 286: ALIGN(32)
! 287: L(top):
! 288: C eax src limb
! 289: C ebx carry high
! 290: C ecx carry low
! 291: C edx scratch
! 292: C esi src+8
! 293: C edi dst
! 294: C ebp multiplier
! 295: C
! 296: C VAR_COUNTER loop counter
! 297: C
! 298: C 17 bytes each limb
! 299:
! 300: L(entry):
! 301: deflit(CHUNK_COUNT,2)
! 302: forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
! 303: deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
! 304: deflit(`disp1', eval(disp0 + 4))
! 305:
! 306: mull %ebp
! 307:
! 308: Zdisp( M4_inst,%ecx, disp0,(%edi))
! 309: movl $0, %ecx C zero without touching the carry flag needed by the adcl below
! 310:
! 311: adcl %eax, %ebx C old high carry plus new low product limb plus carry flag
! 312:
! 313: Zdisp( movl, disp0,(%esi), %eax)
! 314: adcl %edx, %ecx C new high carry
! 315:
! 316:
! 317: mull %ebp
! 318:
! 319: M4_inst %ebx, disp1(%edi)
! 320: movl $0, %ebx C same again with the two carry registers swapped
! 321:
! 322: adcl %eax, %ecx
! 323:
! 324: movl disp1(%esi), %eax
! 325: adcl %edx, %ebx
! 326: ')
! 327:
! 328: decl VAR_COUNTER
! 329: leal UNROLL_BYTES(%esi), %esi
! 330: leal UNROLL_BYTES(%edi), %edi
! 331:
! 332: jns L(top) C tests the sign from the decl, the leal pair leaves flags alone
! 333:
! 334:
! 335: C eax src limb
! 336: C ebx carry high
! 337: C ecx carry low
! 338: C edx
! 339: C esi
! 340: C edi dst (points at second last limb)
! 341: C ebp multiplier
! 342: deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
! 343: deflit(`disp1', eval(disp0-0 + 4))
! 344:
! 345: mull %ebp C product for the last limb
! 346:
! 347: M4_inst %ecx, disp0(%edi) C low carry goes to the second last dst limb
! 348: movl SAVE_EBP, %ebp
! 349:
! 350: adcl %ebx, %eax C low product limb plus high carry plus carry flag
! 351: movl SAVE_EBX, %ebx
! 352: movl SAVE_ESI, %esi
! 353:
! 354: adcl $0, %edx
! 355: M4_inst %eax, disp1(%edi) C last dst limb
! 356: movl SAVE_EDI, %edi
! 357:
! 358: adcl $0, %edx
! 359: addl $SAVE_SIZE, %esp
! 360:
! 361: movl %edx, %eax C return the final carry or borrow limb
! 362: ret
! 363:
! 364: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>