Annotation of OpenXM_contrib/gmp/mpn/x86/k6/aorsmul_1.asm, Revision 1.1
1.1 ! maekawa 1: dnl AMD K6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
! 2: dnl
! 3: dnl K6: 7.65 to 8.5 cycles/limb (at 16 limbs/loop and depending on the data),
! 4: dnl PIC adds about 6 cycles at the start.
! 5:
! 6:
! 7: dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
! 8: dnl
! 9: dnl This file is part of the GNU MP Library.
! 10: dnl
! 11: dnl The GNU MP Library is free software; you can redistribute it and/or
! 12: dnl modify it under the terms of the GNU Lesser General Public License as
! 13: dnl published by the Free Software Foundation; either version 2.1 of the
! 14: dnl License, or (at your option) any later version.
! 15: dnl
! 16: dnl The GNU MP Library is distributed in the hope that it will be useful,
! 17: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
! 18: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
! 19: dnl Lesser General Public License for more details.
! 20: dnl
! 21: dnl You should have received a copy of the GNU Lesser General Public
! 22: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
! 23: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
! 24: dnl Suite 330, Boston, MA 02111-1307, USA.
! 25:
! 26:
! 27: include(`../config.m4')
! 28:
! 29:
! 30: dnl K6: large multpliers small multpliers
! 31: dnl UNROLL_COUNT cycles/limb cycles/limb
! 32: dnl 4 9.5 7.78
! 33: dnl 8 9.0 7.78
! 34: dnl 16 8.4 7.65
! 35: dnl 32 8.4 8.2
! 36: dnl
! 37: dnl Maximum possible unrolling with the current code is 32.
! 38: dnl
! 39: dnl Unrolling to 16 limbs/loop makes the unrolled loop fit exactly in a 256
! 40: dnl byte block, which might explain the good speed at that unrolling.
! 41:
! 42: deflit(UNROLL_COUNT, 16)
! 43:
! 44:
! 45: ifdef(`OPERATION_addmul_1', `
! 46: define(M4_inst, addl)
! 47: define(M4_function_1, mpn_addmul_1)
! 48: define(M4_function_1c, mpn_addmul_1c)
! 49: define(M4_description, add it to)
! 50: define(M4_desc_retval, carry)
! 51: ',`ifdef(`OPERATION_submul_1', `
! 52: define(M4_inst, subl)
! 53: define(M4_function_1, mpn_submul_1)
! 54: define(M4_function_1c, mpn_submul_1c)
! 55: define(M4_description, subtract it from)
! 56: define(M4_desc_retval, borrow)
! 57: ',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
! 58: ')')')
! 59:
! 60: MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
! 61:
! 62:
! 63: C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
! 64: C mp_limb_t mult);
! 65: C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
! 66: C mp_limb_t mult, mp_limb_t carry);
! 67: C
! 68: C Calculate src,size multiplied by mult and M4_description dst,size.
! 69: C Return the M4_desc_retval limb from the top of the result.
! 70: C
! 71: C The jadcl0()s in the unrolled loop makes the speed data dependent. Small
! 72: C multipliers (most significant few bits clear) result in few carry bits and
! 73: C speeds up to 7.65 cycles/limb are attained. Large multipliers (most
! 74: C significant few bits set) make the carry bits 50/50 and lead to something
! 75: C more like 8.4 c/l. (With adcl's both of these would be 9.3 c/l.)
! 76: C
! 77: C It's important that the gains for jadcl0 on small multipliers don't come
! 78: C at the cost of slowing down other data. Tests on uniformly distributed
! 79: C random data, designed to confound branch prediction, show about a 7%
! 80: C speed-up using jadcl0 over adcl (8.93 versus 9.57 cycles/limb, with all
! 81: C overheads included).
! 82: C
! 83: C In the simple loop, jadcl0() measures slower than adcl (11.9-14.7 versus
! 84: C 11.0 cycles/limb), and hence isn't used.
! 85: C
! 86: C In the simple loop, note that running ecx from negative to zero and using
! 87: C it as an index in the two movs wouldn't help. It would save one
! 88: C instruction (2*addl+loop becoming incl+jnz), but there's nothing unpaired
! 89: C that would be collapsed by this.
! 90: C
! 91: C
! 92: C jadcl0
! 93: C ------
! 94: C
! 95: C jadcl0() being faster than adcl $0 seems to be an artifact of two things,
! 96: C firstly the instruction decoding and secondly the fact that there's a
! 97: C carry bit for the jadcl0 only on average about 1/4 of the time.
! 98: C
! 99: C The code in the unrolled loop decodes something like the following.
! 100: C
! 101: C decode cycles
! 102: C mull %ebp 2
! 103: C M4_inst %esi, disp(%edi) 1
! 104: C adcl %eax, %ecx 2
! 105: C movl %edx, %esi \ 1
! 106: C jnc 1f /
! 107: C incl %esi \ 1
! 108: C 1: movl disp(%ebx), %eax /
! 109: C ---
! 110: C 7
! 111: C
! 112: C In a back-to-back style test this measures 7 with the jnc not taken, or 8
! 113: C with it taken (both when correctly predicted). This is opposite to the
! 114: C measurements showing small multipliers running faster than large ones.
! 115: C Watch this space for more info ...
! 116: C
! 117: C It's not clear how much branch misprediction might be costing. The K6
! 118: C doco says it will be 1 to 4 cycles, but presumably it's near the low end
! 119: C of that range to get the measured results.
! 120: C
! 121: C
! 122: C In the code the two carries are more or less the preceding mul product and
! 123: C the calculation is roughly
! 124: C
! 125: C x*y + u*b+v
! 126: C
! 127: C where b=2^32 is the size of a limb, x*y is the two carry limbs, and u and
! 128: C v are the two limbs it's added to (being the low of the next mul, and a
! 129: C limb from the destination).
! 130: C
! 131: C To get a carry requires x*y+u*b+v >= b^2, which is u*b+v >= b^2-x*y, and
! 132: C there are b^2-(b^2-x*y) = x*y many such values, giving a probability of
! 133: C x*y/b^2. If x, y, u and v are random and uniformly distributed between 0
! 134: C and b-1, then the total probability can be summed over x and y,
! 135: C
! 136: C 1 b-1 b-1 x*y 1 b*(b-1) b*(b-1)
! 137: C --- * sum sum --- = --- * ------- * ------- = 1/4
! 138: C b^2 x=0 y=1 b^2 b^4 2 2
! 139: C
! 140: C Actually it's a very tiny bit less than 1/4 of course. If y is fixed,
! 141: C then the probability is 1/2*y/b thus varying linearly between 0 and 1/2.
! 142:
! 143:
! 144: ifdef(`PIC',`
! 145: deflit(UNROLL_THRESHOLD, 9)
! 146: ',`
! 147: deflit(UNROLL_THRESHOLD, 6)
! 148: ')
! 149:
! 150: defframe(PARAM_CARRY, 20)
! 151: defframe(PARAM_MULTIPLIER,16)
! 152: defframe(PARAM_SIZE, 12)
! 153: defframe(PARAM_SRC, 8)
! 154: defframe(PARAM_DST, 4)
! 155:
! 156: .text
! 157: ALIGN(32)
! 158:
! 159: PROLOGUE(M4_function_1c)
! 160: pushl %esi
! 161: deflit(`FRAME',4)
! 162: movl PARAM_CARRY, %esi
! 163: jmp LF(M4_function_1,start_nc)
! 164: EPILOGUE()
! 165:
! 166: PROLOGUE(M4_function_1)
! 167: push %esi
! 168: deflit(`FRAME',4)
! 169: xorl %esi, %esi C initial carry
! 170:
! 171: L(start_nc):
! 172: movl PARAM_SIZE, %ecx
! 173: pushl %ebx
! 174: deflit(`FRAME',8)
! 175:
! 176: movl PARAM_SRC, %ebx
! 177: pushl %edi
! 178: deflit(`FRAME',12)
! 179:
! 180: cmpl $UNROLL_THRESHOLD, %ecx
! 181: movl PARAM_DST, %edi
! 182:
! 183: pushl %ebp
! 184: deflit(`FRAME',16)
! 185: jae L(unroll)
! 186:
! 187:
! 188: C simple loop
! 189:
! 190: movl PARAM_MULTIPLIER, %ebp
! 191:
! 192: L(simple):
! 193: C eax scratch
! 194: C ebx src
! 195: C ecx counter
! 196: C edx scratch
! 197: C esi carry
! 198: C edi dst
! 199: C ebp multiplier
! 200:
! 201: movl (%ebx), %eax
! 202: addl $4, %ebx
! 203:
! 204: mull %ebp
! 205:
! 206: addl $4, %edi
! 207: addl %esi, %eax
! 208:
! 209: adcl $0, %edx
! 210:
! 211: M4_inst %eax, -4(%edi)
! 212:
! 213: adcl $0, %edx
! 214:
! 215: movl %edx, %esi
! 216: loop L(simple)
! 217:
! 218:
! 219: popl %ebp
! 220: popl %edi
! 221:
! 222: popl %ebx
! 223: movl %esi, %eax
! 224:
! 225: popl %esi
! 226: ret
! 227:
! 228:
! 229:
! 230: C -----------------------------------------------------------------------------
! 231: C The unrolled loop uses a "two carry limbs" scheme. At the top of the loop
! 232: C the carries are ecx=lo, esi=hi, then they swap for each limb processed.
! 233: C For the computed jump an odd size means they start one way around, an even
! 234: C size the other.
! 235: C
! 236: C VAR_JUMP holds the computed jump temporarily because there's not enough
! 237: C registers at the point of doing the mul for the initial two carry limbs.
! 238: C
! 239: C The add/adc for the initial carry in %esi is necessary only for the
! 240: C mpn_addmul/submul_1c entry points. Duplicating the startup code to
! 241: C eliminiate this for the plain mpn_add/submul_1 doesn't seem like a good
! 242: C idea.
! 243:
! 244: dnl overlapping with parameters already fetched
! 245: define(VAR_COUNTER, `PARAM_SIZE')
! 246: define(VAR_JUMP, `PARAM_DST')
! 247:
! 248: L(unroll):
! 249: C eax
! 250: C ebx src
! 251: C ecx size
! 252: C edx
! 253: C esi initial carry
! 254: C edi dst
! 255: C ebp
! 256:
! 257: movl %ecx, %edx
! 258: decl %ecx
! 259:
! 260: subl $2, %edx
! 261: negl %ecx
! 262:
! 263: shrl $UNROLL_LOG2, %edx
! 264: andl $UNROLL_MASK, %ecx
! 265:
! 266: movl %edx, VAR_COUNTER
! 267: movl %ecx, %edx
! 268:
! 269: shll $4, %edx
! 270: negl %ecx
! 271:
! 272: C 15 code bytes per limb
! 273: ifdef(`PIC',`
! 274: call L(pic_calc)
! 275: L(here):
! 276: ',`
! 277: leal L(entry) (%edx,%ecx,1), %edx
! 278: ')
! 279: movl (%ebx), %eax C src low limb
! 280:
! 281: movl PARAM_MULTIPLIER, %ebp
! 282: movl %edx, VAR_JUMP
! 283:
! 284: mull %ebp
! 285:
! 286: addl %esi, %eax C initial carry (from _1c)
! 287: jadcl0( %edx)
! 288:
! 289:
! 290: leal 4(%ebx,%ecx,4), %ebx
! 291: movl %edx, %esi C high carry
! 292:
! 293: movl VAR_JUMP, %edx
! 294: leal (%edi,%ecx,4), %edi
! 295:
! 296: testl $1, %ecx
! 297: movl %eax, %ecx C low carry
! 298:
! 299: jz L(noswap)
! 300: movl %esi, %ecx C high,low carry other way around
! 301:
! 302: movl %eax, %esi
! 303: L(noswap):
! 304:
! 305: jmp *%edx
! 306:
! 307:
! 308: ifdef(`PIC',`
! 309: L(pic_calc):
! 310: C See README.family about old gas bugs
! 311: leal (%edx,%ecx,1), %edx
! 312: addl $L(entry)-L(here), %edx
! 313: addl (%esp), %edx
! 314: ret
! 315: ')
! 316:
! 317:
! 318: C -----------------------------------------------------------
! 319: ALIGN(32)
! 320: L(top):
! 321: deflit(`FRAME',16)
! 322: C eax scratch
! 323: C ebx src
! 324: C ecx carry lo
! 325: C edx scratch
! 326: C esi carry hi
! 327: C edi dst
! 328: C ebp multiplier
! 329: C
! 330: C 15 code bytes per limb
! 331:
! 332: leal UNROLL_BYTES(%edi), %edi
! 333:
! 334: L(entry):
! 335: forloop(`i', 0, UNROLL_COUNT/2-1, `
! 336: deflit(`disp0', eval(2*i*4))
! 337: deflit(`disp1', eval(disp0 + 4))
! 338:
! 339: Zdisp( movl, disp0,(%ebx), %eax)
! 340: mull %ebp
! 341: Zdisp( M4_inst,%ecx, disp0,(%edi))
! 342: adcl %eax, %esi
! 343: movl %edx, %ecx
! 344: jadcl0( %ecx)
! 345:
! 346: movl disp1(%ebx), %eax
! 347: mull %ebp
! 348: M4_inst %esi, disp1(%edi)
! 349: adcl %eax, %ecx
! 350: movl %edx, %esi
! 351: jadcl0( %esi)
! 352: ')
! 353:
! 354: decl VAR_COUNTER
! 355: leal UNROLL_BYTES(%ebx), %ebx
! 356:
! 357: jns L(top)
! 358:
! 359:
! 360: popl %ebp
! 361: M4_inst %ecx, UNROLL_BYTES(%edi)
! 362:
! 363: popl %edi
! 364: movl %esi, %eax
! 365:
! 366: popl %ebx
! 367: jadcl0( %eax)
! 368:
! 369: popl %esi
! 370: ret
! 371:
! 372: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>