Annotation of OpenXM_contrib/gmp/mpn/x86/k7/mmx/rshift.asm, Revision 1.1
1.1 ! maekawa 1: dnl AMD K7 mpn_rshift -- mpn right shift.
! 2: dnl
! 3: dnl K7: 1.21 cycles/limb (at 16 limbs/loop).
! 4:
! 5:
! 6: dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
! 7: dnl
! 8: dnl This file is part of the GNU MP Library.
! 9: dnl
! 10: dnl The GNU MP Library is free software; you can redistribute it and/or
! 11: dnl modify it under the terms of the GNU Lesser General Public License as
! 12: dnl published by the Free Software Foundation; either version 2.1 of the
! 13: dnl License, or (at your option) any later version.
! 14: dnl
! 15: dnl The GNU MP Library is distributed in the hope that it will be useful,
! 16: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
! 17: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
! 18: dnl Lesser General Public License for more details.
! 19: dnl
! 20: dnl You should have received a copy of the GNU Lesser General Public
! 21: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
! 22: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
! 23: dnl Suite 330, Boston, MA 02111-1307, USA.
! 24:
! 25:
! 26: include(`../config.m4')
! 27:
! 28:
! 29: dnl K7: UNROLL_COUNT cycles/limb
! 30: dnl 4 1.51
! 31: dnl 8 1.26
! 32: dnl 16 1.21
! 33: dnl 32 1.2
! 34: dnl Maximum possible with the current code is 64.
! 35: dnl UNROLL_COUNT is the number of limbs handled per pass of the unrolled loop.
! 36: deflit(UNROLL_COUNT, 16)
! 37: C NOTE(review): UNROLL_MASK, UNROLL_LOG2 and UNROLL_BYTES are used below but not defined in this file; presumably the included m4 definitions derive them from UNROLL_COUNT -- confirm.
! 38:
! 39: C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
! 40: C unsigned shift);
! 41: C
! 42: C Shift src,size right by shift many bits and store the result in dst,size.
! 43: C Zeros are shifted in at the left. The bits shifted out at the right are
! 44: C the return value.
! 45: C
! 46: C This code uses 64-bit MMX operations, which makes it possible to handle
! 47: C two limbs at a time, for a theoretical 1.0 cycles/limb. Plain integer
! 48: C code, on the other hand, suffers from shrd being a vector path decode and
! 49: C running at 3 cycles back-to-back.
! 50: C
! 51: C Full speed depends on source and destination being aligned, and some hairy
! 52: C setups and finish-ups are done to arrange this for the loop.
! 53: C Sizes of at least UNROLL_THRESHOLD limbs take the unrolled loop, smaller sizes the simple loop; the PIC and non-PIC values are currently the same.
! 54: ifdef(`PIC',`
! 55: deflit(UNROLL_THRESHOLD, 10)
! 56: ',`
! 57: deflit(UNROLL_THRESHOLD, 10)
! 58: ')
! 59: C Frame slots: PARAM_* name the four arguments, SAVE_* name the register save slots covered by the SAVE_SIZE bytes reserved in the prologue.
! 60: defframe(PARAM_SHIFT,16)
! 61: defframe(PARAM_SIZE, 12)
! 62: defframe(PARAM_SRC, 8)
! 63: defframe(PARAM_DST, 4)
! 64:
! 65: defframe(SAVE_EDI, -4)
! 66: defframe(SAVE_ESI, -8)
! 67: defframe(SAVE_EBX, -12)
! 68: deflit(SAVE_SIZE, 12)
! 69:
! 70: .text
! 71: ALIGN(32)
! 72: C Entry point: load the arguments, reserve the save area, and handle a one limb operand inline with integer instructions.
! 73: PROLOGUE(mpn_rshift)
! 74: deflit(`FRAME',0)
! 75:
! 76: movl PARAM_SIZE, %eax C size
! 77: movl PARAM_SRC, %edx C src
! 78: subl $SAVE_SIZE, %esp C room for the SAVE_EDI, SAVE_ESI and SAVE_EBX slots
! 79: deflit(`FRAME',SAVE_SIZE)
! 80:
! 81: movl PARAM_SHIFT, %ecx C shift count, applied through %cl below
! 82: movl %edi, SAVE_EDI
! 83:
! 84: movl PARAM_DST, %edi C dst
! 85: decl %eax C size-1
! 86: jnz L(more_than_one_limb) C taken unless size is exactly 1
! 87:
! 88: movl (%edx), %edx C src limb
! 89: C Return value: the low bits of the limb move into the top of eax, which is zero beforehand.
! 90: shrdl( %cl, %edx, %eax) C eax was decremented to zero
! 91:
! 92: shrl %cl, %edx C limb >> shift, zeros enter at the top
! 93:
! 94: movl %edx, (%edi) C dst limb
! 95: movl SAVE_EDI, %edi
! 96: addl $SAVE_SIZE, %esp
! 97: C No MMX register has been touched on this path, so there is no emms here.
! 98: ret
! 99:
! 100:
! 101: C -----------------------------------------------------------------------------
! 102: L(more_than_one_limb):
! 103: C eax size-1
! 104: C ebx
! 105: C ecx shift
! 106: C edx src
! 107: C esi
! 108: C edi dst
! 109: C ebp
! 110: C Put the shift count and the low src limb in MMX registers, then pick the simple loop or the unrolled loop by size.
! 111: movd PARAM_SHIFT, %mm6 C rshift
! 112: movd (%edx), %mm5 C src low limb
! 113: cmp $UNROLL_THRESHOLD-1, %eax C size-1 against threshold-1, unsigned
! 114: C NOTE(review): a size of zero would have wrapped to 0xFFFFFFFF and take the unrolled path; presumably callers guarantee size >= 1 -- confirm.
! 115: jae L(unroll) C size >= UNROLL_THRESHOLD
! 116: leal (%edx,%eax,4), %edx C &src[size-1]
! 117: leal -4(%edi,%eax,4), %edi C &dst[size-2]
! 118:
! 119: movd (%edx), %mm4 C src high limb
! 120: negl %eax C -(size-1), counted up to zero by the loop
! 121:
! 122:
! 123: L(simple_top):
! 124: C eax loop counter, limbs, negative
! 125: C ebx
! 126: C ecx shift
! 127: C edx &src[size-1]
! 128: C esi
! 129: C edi &dst[size-2]
! 130: C ebp
! 131: C
! 132: C mm0 scratch
! 133: C mm4 src high limb
! 134: C mm5 src low limb
! 135: C mm6 shift
! 136: C Each pass loads two adjacent src limbs as one qword, shifts it right, and stores the low dword as one dst limb.
! 137: movq (%edx,%eax,4), %mm0
! 138: incl %eax C sets the flags tested by the jnz below
! 139:
! 140: psrlq %mm6, %mm0
! 141:
! 142: movd %mm0, (%edi,%eax,4)
! 143: jnz L(simple_top)
! 144:
! 145: C Finish: store the high dst limb and form the return value from the low src limb.
! 146: psllq $32, %mm5 C low limb up into the high dword
! 147: psrlq %mm6, %mm4 C high limb >> shift, zeros enter at the top
! 148:
! 149: psrlq %mm6, %mm5 C shifted-out bits are now at the top of the low dword
! 150: movd %mm4, 4(%edi) C dst high limb
! 151:
! 152: movd %mm5, %eax C return value
! 153:
! 154: movl SAVE_EDI, %edi
! 155: addl $SAVE_SIZE, %esp
! 156: emms C done with the MMX registers
! 157:
! 158: ret
! 159:
! 160:
! 161: C -----------------------------------------------------------------------------
! 162: ALIGN(16)
! 163: L(unroll):
! 164: C eax size-1
! 165: C ebx
! 166: C ecx shift
! 167: C edx src
! 168: C esi
! 169: C edi dst
! 170: C ebp
! 171: C
! 172: C mm5 src low limb
! 173: C mm6 rshift
! 174: C Setup for the unrolled loop: save esi and ebx, bring src and dst to 8 byte alignment, then jump into the unrolled code at a computed offset.
! 175: testb $4, %dl C is src at 4 mod 8
! 176: movl %esi, SAVE_ESI
! 177: movl %ebx, SAVE_EBX
! 178:
! 179: psllq $32, %mm5 C low limb up into the high dword, for the return value
! 180: jz L(start_src_aligned) C flags are still those of the testb above
! 181:
! 182:
! 183: C src isn't aligned, process low limb separately (marked xxx) and
! 184: C step src and dst by one limb, making src aligned.
! 185: C
! 186: C source edx
! 187: C --+-------+-------+-------+
! 188: C | xxx |
! 189: C --+-------+-------+-------+
! 190: C 4mod8 0mod8 4mod8
! 191: C
! 192: C dest edi
! 193: C --+-------+-------+
! 194: C | | xxx |
! 195: C --+-------+-------+
! 196:
! 197: movq (%edx), %mm0 C src low two limbs
! 198: addl $4, %edx
! 199: movl %eax, PARAM_SIZE C size-1
! 200:
! 201: addl $4, %edi
! 202: decl %eax C size-2 is new size-1
! 203:
! 204: psrlq %mm6, %mm0
! 205: movl %edi, PARAM_DST C new dst
! 206:
! 207: movd %mm0, -4(%edi) C low dst limb, stored at the old dst
! 208: L(start_src_aligned):
! 209:
! 210:
! 211: movq (%edx), %mm1 C src low two limbs
! 212: decl %eax C size-2, two last limbs handled at end
! 213: testl $4, %edi
! 214:
! 215: psrlq %mm6, %mm5 C return value bits, formed before mm6 can become shift+32
! 216: jz L(start_dst_aligned) C flags are still those of the testl above
! 217:
! 218:
! 219: C dst isn't aligned, add 4 to make it so, and pretend the shift is
! 220: C 32 bits extra. Low limb of dst (marked xxx) handled here separately.
! 221: C
! 222: C source edx
! 223: C --+-------+-------+
! 224: C | mm1 |
! 225: C --+-------+-------+
! 226: C 4mod8 0mod8
! 227: C
! 228: C dest edi
! 229: C --+-------+-------+-------+
! 230: C | xxx |
! 231: C --+-------+-------+-------+
! 232: C 4mod8 0mod8 4mod8
! 233:
! 234: movq %mm1, %mm0 C keep an unshifted copy
! 235: psrlq %mm6, %mm1
! 236: addl $32, %ecx C shift+32
! 237:
! 238: movd %mm1, (%edi) C low dst limb
! 239: movq %mm0, %mm1 C restore the unshifted qword
! 240: addl $4, %edi C new dst
! 241:
! 242: movd %ecx, %mm6 C rshift is shift+32 from here on
! 243: L(start_dst_aligned):
! 244:
! 245:
! 246: movq %mm1, %mm2 C copy of src low two limbs
! 247: negl %ecx
! 248: andl $-2, %eax C round size down to even
! 249:
! 250: movl %eax, %ebx
! 251: negl %eax
! 252: addl $64, %ecx C ecx = 64-rshift
! 253:
! 254: andl $UNROLL_MASK, %eax C negated limb count, masked
! 255: decl %ebx
! 256:
! 257: shll %eax C doubled, the leal below multiplies by 5 for 10 code bytes per limb
! 258:
! 259: movd %ecx, %mm7 C lshift = 64-rshift
! 260: C Computed jump target: esi = L(entry) + 5*eax. The PIC variant gets the same value from pic_calc. Both then negate eax.
! 261: ifdef(`PIC',`
! 262: call L(pic_calc)
! 263: L(here):
! 264: ',`
! 265: leal L(entry) (%eax,%eax,4), %esi
! 266: negl %eax
! 267: ')
! 268: shrl $UNROLL_LOG2, %ebx C loop counter
! 269: C Bias src and dst by eax*2 bytes, eax now being negative or zero, to match the entry point. src gets 8 more since its first qword is already in mm1 and mm2. NOTE(review): the extra 128 when UNROLL_BYTES is 256 pairs with the -128 in disp0; presumably it keeps displacements within a signed byte -- confirm.
! 270: leal ifelse(UNROLL_BYTES,256,128+) 8(%edx,%eax,2), %edx
! 271: leal ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi
! 272: movl PARAM_SIZE, %eax C for use at end
! 273:
! 274: jmp *%esi
! 275:
! 276: C PIC helper: forms 5*eax + (L(entry)-L(here)) and adds the return address found at (%esp), which is the address of L(here).
! 277: ifdef(`PIC',`
! 278: L(pic_calc):
! 279: C See README.family about old gas bugs
! 280: leal (%eax,%eax,4), %esi
! 281: addl $L(entry)-L(here), %esi
! 282: addl (%esp), %esi
! 283: negl %eax
! 284:
! 285: ret
! 286: ')
! 287:
! 288:
! 289: C -----------------------------------------------------------------------------
! 290: ALIGN(64)
! 291: L(top):
! 292: C eax size, for use at end (size-1 if the src alignment step was taken)
! 293: C ebx loop counter
! 294: C ecx lshift
! 295: C edx src
! 296: C esi was computed jump
! 297: C edi dst
! 298: C ebp
! 299: C
! 300: C mm0 scratch
! 301: C mm1 \ carry (alternating)
! 302: C mm2 /
! 303: C mm6 rshift
! 304: C mm7 lshift
! 305: C
! 306: C 10 code bytes/limb
! 307: C
! 308: C The two chunks differ in whether mm1 or mm2 hold the carry.
! 309: C The computed jump puts the initial carry in both mm1 and mm2.
! 310: C Each forloop chunk below handles CHUNK_COUNT limbs as two qwords: the new qword shifted left by lshift is ORed with the previous qword shifted right by rshift.
! 311: L(entry):
! 312: deflit(CHUNK_COUNT, 4)
! 313: forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
! 314: deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
! 315: deflit(`disp1', eval(disp0 + 8))
! 316:
! 317: movq disp0(%edx), %mm0
! 318: psrlq %mm6, %mm2
! 319:
! 320: movq %mm0, %mm1
! 321: psllq %mm7, %mm0
! 322:
! 323: por %mm2, %mm0
! 324: movq %mm0, disp0(%edi)
! 325:
! 326:
! 327: movq disp1(%edx), %mm0
! 328: psrlq %mm6, %mm1
! 329:
! 330: movq %mm0, %mm2
! 331: psllq %mm7, %mm0
! 332:
! 333: por %mm1, %mm0
! 334: movq %mm0, disp1(%edi)
! 335: ')
! 336: C Step both pointers past one unrolled block and repeat while the counter stays non-negative.
! 337: addl $UNROLL_BYTES, %edx
! 338: addl $UNROLL_BYTES, %edi
! 339: decl %ebx C sets the sign flag tested below
! 340:
! 341: jns L(top)
! 342:
! 343:
! 344: deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
! 345: deflit(`disp1', eval(disp0-0 + 8))
! 346: C After the loop: bit 0 of the saved size picks the odd or even finish, and mm2 holds the last src qword loaded.
! 347: testb $1, %al C odd or even limb count
! 348: psrlq %mm6, %mm2 C wanted rshifted in all cases below
! 349: movl SAVE_ESI, %esi
! 350:
! 351: movd %mm5, %eax C return value
! 352:
! 353: movl SAVE_EBX, %ebx
! 354: jz L(end_even) C flags are still those of the testb above
! 355:
! 356:
! 357: C Size odd, destination was aligned.
! 358: C
! 359: C source
! 360: C edx
! 361: C +-------+---------------+--
! 362: C | | mm2 |
! 363: C +-------+---------------+--
! 364: C
! 365: C dest edi
! 366: C +-------+---------------+---------------+--
! 367: C | | | written |
! 368: C +-------+---------------+---------------+--
! 369: C
! 370: C mm6 = shift
! 371: C mm7 = ecx = 64-shift
! 372:
! 373:
! 374: C Size odd, destination was unaligned.
! 375: C
! 376: C source
! 377: C edx
! 378: C +-------+---------------+--
! 379: C | | mm2 |
! 380: C +-------+---------------+--
! 381: C
! 382: C dest edi
! 383: C +---------------+---------------+--
! 384: C | | written |
! 385: C +---------------+---------------+--
! 386: C
! 387: C mm6 = shift+32
! 388: C mm7 = ecx = 64-(shift+32)
! 389:
! 390:
! 391: C In both cases there's one extra limb of src to fetch and combine
! 392: C with mm2 to make a qword to store, and in the aligned case there's
! 393: C a further extra limb of dst to be formed.
! 394:
! 395:
! 396: movd disp0(%edx), %mm0 C the one extra src limb
! 397: movq %mm0, %mm1
! 398:
! 399: psllq %mm7, %mm0
! 400: testb $32, %cl C set for ecx = 64-shift (dst aligned), clear for 64-(shift+32); assumes shift is 1 to 31 -- confirm
! 401:
! 402: por %mm2, %mm0
! 403: psrlq %mm6, %mm1 C the further dst limb, stored only in the aligned case
! 404:
! 405: movq %mm0, disp0(%edi)
! 406: jz L(finish_odd_unaligned) C flags are still those of the testb above
! 407:
! 408: movd %mm1, disp1(%edi)
! 409: L(finish_odd_unaligned):
! 410:
! 411: movl SAVE_EDI, %edi
! 412: addl $SAVE_SIZE, %esp
! 413: emms C done with the MMX registers
! 414:
! 415: ret
! 416:
! 417:
! 418: L(end_even):
! 419:
! 420: C Size even, destination was aligned.
! 421: C
! 422: C source
! 423: C +---------------+--
! 424: C | mm2 |
! 425: C +---------------+--
! 426: C
! 427: C dest edi
! 428: C +---------------+---------------+--
! 429: C | | mm2 |
! 430: C +---------------+---------------+--
! 431: C
! 432: C mm6 = shift
! 433: C mm7 = ecx = 64-shift
! 434:
! 435:
! 436: C Size even, destination was unaligned.
! 437: C
! 438: C source
! 439: C +---------------+--
! 440: C | mm2 |
! 441: C +---------------+--
! 442: C
! 443: C dest edi
! 444: C +-------+---------------+--
! 445: C | | mm2 |
! 446: C +-------+---------------+--
! 447: C
! 448: C mm6 = shift+32
! 449: C mm7 = 64-(shift+32)
! 450:
! 451:
! 452: C The movd for the unaligned case is the same data as the movq for
! 453: C the aligned case, it's just a choice between whether one or two
! 454: C limbs should be written.
! 455: C The value stored is mm2 as already shifted right by mm6 before the branch to this label; no other MMX register is involved.
! 456:
! 457: testb $32, %cl C set for ecx = 64-shift (dst aligned), clear for 64-(shift+32); assumes shift is 1 to 31 -- confirm
! 458: movd %mm2, disp0(%edi) C one limb, always written
! 459:
! 460: jz L(end_even_unaligned) C flags are still those of the testb above
! 461:
! 462: movq %mm2, disp0(%edi) C aligned case, rewrite as two limbs
! 463: L(end_even_unaligned):
! 464:
! 465: movl SAVE_EDI, %edi
! 466: addl $SAVE_SIZE, %esp
! 467: emms C done with the MMX registers
! 468:
! 469: ret
! 470:
! 471: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>