Annotation of OpenXM_contrib/gmp/mpn/x86/k7/mmx/lshift.asm, Revision 1.1.1.1
1.1 maekawa 1: dnl AMD K7 mpn_lshift -- mpn left shift.
2: dnl
3: dnl K7: 1.21 cycles/limb (at 16 limbs/loop).
4:
5:
6: dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
7: dnl
8: dnl This file is part of the GNU MP Library.
9: dnl
10: dnl The GNU MP Library is free software; you can redistribute it and/or
11: dnl modify it under the terms of the GNU Lesser General Public License as
12: dnl published by the Free Software Foundation; either version 2.1 of the
13: dnl License, or (at your option) any later version.
14: dnl
15: dnl The GNU MP Library is distributed in the hope that it will be useful,
16: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
17: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18: dnl Lesser General Public License for more details.
19: dnl
20: dnl You should have received a copy of the GNU Lesser General Public
21: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
22: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
23: dnl Suite 330, Boston, MA 02111-1307, USA.
24:
25:
26: include(`../config.m4')
27:
28:
29: dnl K7: UNROLL_COUNT cycles/limb
30: dnl 4 1.51
31: dnl 8 1.26
32: dnl 16 1.21
33: dnl 32 1.2
34: dnl Maximum possible with the current code is 64.
35:
36: deflit(UNROLL_COUNT, 16) C limbs handled per pass of the unrolled loop, see the timing table above
37:
38:
39: C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
40: C unsigned shift);
41: C
42: C Shift src,size left by shift many bits and store the result in dst,size.
43: C Zeros are shifted in at the right. The bits shifted out at the left are
44: C the return value.
45: C
46: C The comments in mpn_rshift apply here too.
47:
48: ifdef(`PIC',`
49: deflit(UNROLL_THRESHOLD, 10)
50: ',`
51: deflit(UNROLL_THRESHOLD, 10)
52: ')
53: C Sizes of UNROLL_THRESHOLD limbs or more take the unrolled loop.  Both PIC settings currently use 10.
54: defframe(PARAM_SHIFT,16) C NOTE(review): defframe is defined outside this file, presumably giving an esp-relative operand adjusted by FRAME -- confirm
55: defframe(PARAM_SIZE, 12)
56: defframe(PARAM_SRC, 8)
57: defframe(PARAM_DST, 4)
58: C Save slots for edi, esi and ebx.  SAVE_SIZE is the byte count subtracted from esp at entry to hold them.
59: defframe(SAVE_EDI, -4)
60: defframe(SAVE_ESI, -8)
61: defframe(SAVE_EBX, -12)
62: deflit(SAVE_SIZE, 12)
63:
64: .text
65: ALIGN(32)
66:
67: PROLOGUE(mpn_lshift)
68: deflit(`FRAME',0)
69: C Entry.  Load size and src, reserve SAVE_SIZE bytes of stack, then load shift and dst.
70: movl PARAM_SIZE, %eax
71: movl PARAM_SRC, %edx
72: subl $SAVE_SIZE, %esp
73: deflit(`FRAME',SAVE_SIZE) C FRAME follows the esp adjustment just made
74:
75: movl PARAM_SHIFT, %ecx
76: movl %edi, SAVE_EDI C edi is reloaded from this slot before each return
77:
78: movl PARAM_DST, %edi
79: decl %eax C eax = size-1, zero flag set when size is 1
80: jnz L(more_than_one_limb)
81: C Size 1 case, done with integer instructions only.
82: movl (%edx), %edx C the single src limb
83:
84: shldl( %cl, %edx, %eax) C eax was decremented to zero, so the shld leaves in it just the bits shifted out of the top of edx
85:
86: shll %cl, %edx C the shifted limb
87:
88: movl %edx, (%edi)
89: movl SAVE_EDI, %edi
90: addl $SAVE_SIZE, %esp C release the save area
91:
92: ret C eax holds the bits shifted out
93:
94:
95: C -----------------------------------------------------------------------------
96: L(more_than_one_limb):
97: C eax size-1
98: C ebx
99: C ecx shift
100: C edx src
101: C esi
102: C edi dst
103: C ebp
104: C Put shift and the src high limb in MMX registers, then pick the simple or the unrolled loop.
105: movd PARAM_SHIFT, %mm6
106: movd (%edx,%eax,4), %mm5 C src high limb
107: cmp $UNROLL_THRESHOLD-1, %eax C size-1 against threshold-1
108:
109: jae L(unroll) C taken when size >= UNROLL_THRESHOLD
110: negl %ecx
111: movd (%edx), %mm4 C src low limb
112:
113: addl $32, %ecx C ecx = 32-shift
114:
115: movd %ecx, %mm7
116:
117: L(simple_top):
118: C eax loop counter, limbs
119: C ebx
120: C ecx
121: C edx src
122: C esi
123: C edi dst
124: C ebp
125: C
126: C mm0 scratch
127: C mm4 src low limb
128: C mm5 src high limb
129: C mm6 shift
130: C mm7 32-shift
131: C Each pass reads src[eax-1] and src[eax] as one qword and stores dst[eax], working from the high end down to dst[1].
132: movq -4(%edx,%eax,4), %mm0
133: decl %eax C sets the flags tested by the jnz below, the MMX instructions leave them alone
134:
135: psrlq %mm7, %mm0 C low dword is now the dst limb
136:
137: movd %mm0, 4(%edi,%eax,4) C eax is already one lower, hence the 4
138: jnz L(simple_top)
139:
140: C dst[0] and the return value come from the limbs kept in mm4 and mm5.
141: psllq %mm6, %mm5
142: psllq %mm6, %mm4
143:
144: psrlq $32, %mm5 C bits shifted out of the high limb
145: movd %mm4, (%edi) C dst low limb
146:
147: movd %mm5, %eax C return value
148:
149: movl SAVE_EDI, %edi
150: addl $SAVE_SIZE, %esp
151: emms C empty the MMX state before returning
152:
153: ret
154:
155:
156: C -----------------------------------------------------------------------------
157: ALIGN(16)
158: L(unroll):
159: C eax size-1
160: C ebx (saved below)
161: C ecx shift
162: C edx src
163: C esi
164: C edi dst
165: C ebp
166: C
167: C mm5 src high limb, for return value
168: C mm6 lshift
169: C Save esi and ebx, which later hold the jump target and the loop counter, and point edx at the top two src limbs.
170: movl %esi, SAVE_ESI
171: movl %ebx, SAVE_EBX
172: leal -4(%edx,%eax,4), %edx C &src[size-2]
173:
174: testb $4, %dl C bit 2 of the address, 4mod8 versus 0mod8
175: movq (%edx), %mm1 C src high qword
176:
177: jz L(start_src_aligned) C flags still from the testb, movq does not change them
178:
179:
180: C src isn't aligned, process high limb (marked xxx) separately to
181: C make it so
182: C
183: C source -4(edx,%eax,4)
184: C |
185: C +-------+-------+-------+--
186: C | xxx |
187: C +-------+-------+-------+--
188: C 0mod8 4mod8 0mod8
189: C
190: C dest -4(edi,%eax,4)
191: C |
192: C +-------+-------+--
193: C | xxx | |
194: C +-------+-------+--
195:
196: psllq %mm6, %mm1 C after the shift the high dword is dst[size-1]
197: subl $4, %edx
198: movl %eax, PARAM_SIZE C size-1, the limb count still to do, reloaded before the jump into the loop
199:
200: psrlq $32, %mm1 C bring that dword down for the movd
201: decl %eax C size-2 is new size-1
202:
203: movd %mm1, 4(%edi,%eax,4) C store the dst high limb
204: movq (%edx), %mm1 C new src high qword
205: L(start_src_aligned):
206: C Here mm1 is the src qword at edx and eax is one less than the number of limbs still to store.
207:
208: leal -4(%edi,%eax,4), %edi C &dst[size-2]
209: psllq %mm6, %mm5 C mm5 was loaded with the src high limb
210:
211: testl $4, %edi C same 4mod8 test, this time on dst
212: psrlq $32, %mm5 C return value
213:
214: jz L(start_dst_aligned) C flags still from the testl
215:
216:
217: C dst isn't aligned, subtract 4 bytes to make it so, and pretend the
218: C shift is 32 bits extra. High limb of dst (marked xxx) handled
219: C here separately.
220: C
221: C source %edx
222: C +-------+-------+--
223: C | mm1 |
224: C +-------+-------+--
225: C 0mod8 4mod8
226: C
227: C dest %edi
228: C +-------+-------+-------+--
229: C | xxx |
230: C +-------+-------+-------+--
231: C 0mod8 4mod8 0mod8
232:
233: movq %mm1, %mm0 C keep an unshifted copy
234: psllq %mm6, %mm1
235: addl $32, %ecx C shift+32
236:
237: psrlq $32, %mm1 C high dword of the shifted qword is the dst high limb
238:
239: movd %mm1, 4(%edi) C store it at the limb marked xxx
240: movq %mm0, %mm1 C back to the unshifted src qword
241: subl $4, %edi
242:
243: movd %ecx, %mm6 C new lshift
244: L(start_dst_aligned):
245: C Set up the computed jump: ebx becomes the pass counter, eax the entry offset, esi the jump target.
246: decl %eax C size-2, two last limbs handled at end
247: movq %mm1, %mm2 C copy of src high qword
248: negl %ecx
249:
250: andl $-2, %eax C round size down to even
251: addl $64, %ecx C ecx = 64-lshift
252:
253: movl %eax, %ebx
254: negl %eax
255:
256: andl $UNROLL_MASK, %eax C NOTE(review): UNROLL_MASK is defined elsewhere, presumably UNROLL_COUNT-1, making eax the limbs to skip in the first pass -- confirm
257: decl %ebx
258:
259: shll %eax C doubled here and times 5 in the leal, matching 10 code bytes/limb
260:
261: movd %ecx, %mm7 C rshift = 64-lshift
262:
263: ifdef(`PIC',`
264: call L(pic_calc)
265: L(here):
266: ',`
267: leal L(entry) (%eax,%eax,4), %esi
268: ')
269: shrl $UNROLL_LOG2, %ebx C loop counter
270: C eax was doubled above, so scale 2 moves each pointer 4 bytes per unit of the masked count.  The ifelse adds 128 when UNROLL_BYTES is 256.
271: leal ifelse(UNROLL_BYTES,256,128) -8(%edx,%eax,2), %edx
272: leal ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi
273: movl PARAM_SIZE, %eax C for use at end, where only bit 0 is tested
274: jmp *%esi
275:
276:
277: ifdef(`PIC',`
278: L(pic_calc):
279: C See README.family about old gas bugs
280: leal (%eax,%eax,4), %esi C esi = 5*eax, the code offset of the chosen entry
281: addl $L(entry)-L(here), %esi C plus the distance from L(here) to L(entry)
282: addl (%esp), %esi C plus the return address, which is L(here)
283: C esi now holds the run-time address to jump to, found without an absolute relocation
284: ret
285: ')
286:
287:
288: C -----------------------------------------------------------------------------
289: ALIGN(32)
290: L(top):
291: C eax size (for use at end)
292: C ebx loop counter
293: C ecx rshift
294: C edx src
295: C esi computed jump
296: C edi dst
297: C ebp
298: C
299: C mm0 scratch
300: C mm1 \ carry (alternating, mm2 first)
301: C mm2 /
302: C mm6 lshift
303: C mm7 rshift
304: C
305: C 10 code bytes/limb
306: C
307: C The two chunks differ in whether mm1 or mm2 hold the carry.
308: C The computed jump puts the initial carry in both mm1 and mm2.
309: C Each half chunk loads the next lower src qword, ORs its top bits (shifted right by rshift) into the carry qword shifted left by lshift, stores the result at dst, and keeps the unshifted load as the next carry.
310: L(entry):
311: deflit(CHUNK_COUNT, 4)
312: forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
313: deflit(`disp0', eval(-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
314: deflit(`disp1', eval(disp0 - 8))
315:
316: movq disp0(%edx), %mm0
317: psllq %mm6, %mm2 C carry qword shifted into place
318:
319: movq %mm0, %mm1 C unshifted copy becomes the next carry
320: psrlq %mm7, %mm0
321:
322: por %mm2, %mm0
323: movq %mm0, disp0(%edi)
324:
325:
326: movq disp1(%edx), %mm0
327: psllq %mm6, %mm1 C same again with mm1 and mm2 swapped
328:
329: movq %mm0, %mm2
330: psrlq %mm7, %mm0
331:
332: por %mm1, %mm0
333: movq %mm0, disp1(%edi)
334: ')
335: C Step both pointers down by UNROLL_BYTES and go round again while ebx stays non-negative.
336: subl $UNROLL_BYTES, %edx
337: subl $UNROLL_BYTES, %edi
338: decl %ebx
339:
340: jns L(top)
341:
342:
343:
344: define(`disp', `m4_empty_if_zero(eval($1 ifelse(UNROLL_BYTES,256,-128)))')
345: C disp gives displacements the same -128 adjustment as the loop uses when UNROLL_BYTES is 256.
346: L(end):
347: testb $1, %al C odd or even limb count, the flags survive to the jz below
348: movl SAVE_EBX, %ebx
349: psllq %mm6, %mm2 C wanted left shifted in all cases below
350:
351: movd %mm5, %eax C return value, computed before the loop
352:
353: movl SAVE_ESI, %esi
354: jz L(end_even)
355:
356:
357: L(end_odd):
358:
359: C Size odd, destination was aligned.
360: C
361: C source edx+8 edx+4
362: C --+---------------+-------+
363: C | mm2 | |
364: C --+---------------+-------+
365: C
366: C dest edi
367: C --+---------------+---------------+-------+
368: C | written | | |
369: C --+---------------+---------------+-------+
370: C
371: C mm6 = shift
372: C mm7 = ecx = 64-shift
373:
374:
375: C Size odd, destination was unaligned.
376: C
377: C source edx+8 edx+4
378: C --+---------------+-------+
379: C | mm2 | |
380: C --+---------------+-------+
381: C
382: C dest edi
383: C --+---------------+---------------+
384: C | written | |
385: C --+---------------+---------------+
386: C
387: C mm6 = shift+32
388: C mm7 = ecx = 64-(shift+32)
389:
390:
391: C In both cases there's one extra limb of src to fetch and combine
392: C with mm2 to make a qword at (%edi), and in the aligned case
393: C there's an extra limb of dst to be formed from that extra src limb
394: C left shifted.
395:
396: movd disp(4) (%edx), %mm0 C the extra src limb
397: testb $32, %cl C bit 5 of rshift, expected set only in the aligned case (shift of 1 to 31 assumed)
398:
399: movq %mm0, %mm1
400: psllq $32, %mm0 C extra limb up into the high dword
401:
402: psrlq %mm7, %mm0 C then down by rshift to line up under mm2
403: psllq %mm6, %mm1 C extra limb shifted left, used only in the aligned case
404:
405: por %mm2, %mm0
406:
407: movq %mm0, disp(0) (%edi)
408: jz L(end_odd_unaligned) C flags still from the testb above
409: movd %mm1, disp(-4) (%edi) C aligned case only, the extra dst limb
410: L(end_odd_unaligned):
411:
412: movl SAVE_EDI, %edi
413: addl $SAVE_SIZE, %esp
414: emms C empty the MMX state before returning
415:
416: ret
417:
418:
419: L(end_even):
420:
421: C Size even, destination was aligned.
422: C
423: C source edx+8
424: C --+---------------+
425: C | mm2 |
426: C --+---------------+
427: C
428: C dest edi
429: C --+---------------+---------------+
430: C | written | |
431: C --+---------------+---------------+
432: C
433: C mm6 = shift
434: C mm7 = ecx = 64-shift
435:
436:
437: C Size even, destination was unaligned.
438: C
439: C source edx+8
440: C --+---------------+
441: C | mm2 |
442: C --+---------------+
443: C
444: C dest edi+4
445: C --+---------------+-------+
446: C | written | |
447: C --+---------------+-------+
448: C
449: C mm6 = shift+32
450: C mm7 = ecx = 64-(shift+32)
451:
452:
453: C The movq for the aligned case overwrites the movd for the
454: C unaligned case.
455: C mm2 was already shifted left at L(end).  Its high dword is always stored, then the aligned case stores the whole qword.
456: movq %mm2, %mm0 C keep the full qword for the aligned store
457: psrlq $32, %mm2 C high dword down for the movd
458:
459: testb $32, %cl C bit 5 of rshift, expected set only in the aligned case (shift of 1 to 31 assumed)
460: movd %mm2, disp(4) (%edi)
461:
462: jz L(end_even_unaligned) C flags still from the testb, movd does not change them
463: movq %mm0, disp(0) (%edi)
464: L(end_even_unaligned):
465:
466: movl SAVE_EDI, %edi
467: addl $SAVE_SIZE, %esp
468: emms C empty the MMX state before returning
469:
470: ret
471:
472: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>