Annotation of OpenXM_contrib/gmp/mpn/x86/x86-defs.m4, Revision 1.1.1.2
1.1 maekawa 1: divert(-1)
2:
3: dnl m4 macros for x86 assembler.
4:
5:
1.1.1.2 ! ohara 6: dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
1.1 maekawa 7: dnl
8: dnl This file is part of the GNU MP Library.
9: dnl
10: dnl The GNU MP Library is free software; you can redistribute it and/or
11: dnl modify it under the terms of the GNU Lesser General Public License as
12: dnl published by the Free Software Foundation; either version 2.1 of the
13: dnl License, or (at your option) any later version.
14: dnl
15: dnl The GNU MP Library is distributed in the hope that it will be useful,
16: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
17: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18: dnl Lesser General Public License for more details.
19: dnl
20: dnl You should have received a copy of the GNU Lesser General Public
21: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
22: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
23: dnl Suite 330, Boston, MA 02111-1307, USA.
24:
25:
26: dnl Notes:
27: dnl
28: dnl m4 isn't perfect for processing BSD style x86 assembler code, the main
29: dnl problems are,
30: dnl
31: dnl 1. Doing define(foo,123) and then using foo in an addressing mode like
32: dnl foo(%ebx) expands as a macro rather than a constant. This is worked
33: dnl around by using deflit() from asm-defs.m4, instead of define().
34: dnl
35: dnl 2. Immediates in macro definitions need a space or `' to stop the $
36: dnl looking like a macro parameter. For example,
37: dnl
38: dnl define(foo, `mov $ 123, %eax')
39: dnl
40: dnl This is only a problem in macro definitions, not in ordinary text,
1.1.1.2 ! ohara 41: dnl and not in macro parameters like text passed to forloop() or ifdef().
1.1 maekawa 42:
43:
44: deflit(BYTES_PER_MP_LIMB, 4)
45:
46:
1.1.1.2 ! ohara 47: dnl Libtool gives -DPIC -DDLL_EXPORT to indicate a cygwin or mingw DLL. We
! 48: dnl undefine PIC since we don't need to be position independent in this
! 49: dnl case and definitely don't want the ELF style _GLOBAL_OFFSET_TABLE_ etc.
1.1 maekawa 50:
1.1.1.2 ! ohara 51: ifdef(`DLL_EXPORT',`undefine(`PIC')')
1.1 maekawa 52:
53:
1.1.1.2 ! ohara 54: dnl Called: PROLOGUE_cpu(GSYM_PREFIX`'foo)
1.1 maekawa 55: dnl
1.1.1.2 ! ohara 56: dnl In the x86 code we use explicit TEXT and ALIGN() calls in the code,
! 57: dnl since different alignments are wanted in various circumstances. So for
! 58: dnl instance,
1.1 maekawa 59: dnl
1.1.1.2 ! ohara 60: dnl TEXT
! 61: dnl ALIGN(16)
! 62: dnl PROLOGUE(mpn_add_n)
! 63: dnl ...
! 64: dnl EPILOGUE()
1.1 maekawa 65:
1.1.1.2 ! ohara 66: define(`PROLOGUE_cpu',
1.1 maekawa 67: m4_assert_numargs(1)
1.1.1.2 ! ohara 68: `GLOBL $1
! 69: TYPE($1,`function')
! 70: $1:
! 71: ifelse(WANT_PROFILING,`no',,`call_mcount
! 72: ')')
1.1 maekawa 73:
74:
1.1.1.2 ! ohara 75: dnl Usage: call_mcount
1.1 maekawa 76: dnl
1.1.1.2 ! ohara 77: dnl For `gprof' style profiling, %ebp is setup as a frame pointer. None of
! 78: dnl the assembler routines use %ebp this way, so it's done only for the
! 79: dnl benefit of mcount. glibc sysdeps/i386/i386-mcount.S shows how mcount
! 80: dnl gets the current function from (%esp) and the parent from 4(%ebp).
1.1 maekawa 81: dnl
1.1.1.2 ! ohara 82: dnl For `prof' style profiling gcc generates mcount calls without setting
! 83: dnl up %ebp, and the same is done here.
1.1 maekawa 84:
1.1.1.2 ! ohara 85: define(`call_mcount',
! 86: m4_assert_numargs(-1)
! 87: m4_assert_defined(`WANT_PROFILING')
! 88: m4_assert_defined(`MCOUNT_PIC_REG')
! 89: m4_assert_defined(`MCOUNT_NONPIC_REG')
! 90: m4_assert_defined(`MCOUNT_PIC_CALL')
! 91: m4_assert_defined(`MCOUNT_NONPIC_CALL')
! 92: `ifelse(ifdef(`PIC',`MCOUNT_PIC_REG',`MCOUNT_NONPIC_REG'),,,
! 93: ` DATA
! 94: ALIGN(4)
! 95: L(mcount_data_`'mcount_data_counter):
! 96: W32 0
! 97: TEXT
! 98: ')dnl
! 99: ifelse(WANT_PROFILING,`gprof',
! 100: ` pushl %ebp
! 101: movl %esp, %ebp
! 102: ')dnl
! 103: ifdef(`PIC',
! 104: ` pushl %ebx
! 105: mcount_movl_GOT_ebx
! 106: ifelse(MCOUNT_PIC_REG,,,
! 107: ` leal L(mcount_data_`'mcount_data_counter)@GOTOFF(%ebx), MCOUNT_PIC_REG')
! 108: MCOUNT_PIC_CALL
! 109: popl %ebx
! 110: ',`dnl non-PIC
! 111: ifelse(MCOUNT_NONPIC_REG,,,
! 112: ` movl `$'L(mcount_data_`'mcount_data_counter), MCOUNT_NONPIC_REG
! 113: ')dnl
! 114: MCOUNT_NONPIC_CALL
! 115: ')dnl
! 116: ifelse(WANT_PROFILING,`gprof',
! 117: ` popl %ebp
! 118: ')
! 119: define(`mcount_data_counter',eval(mcount_data_counter+1))')
! 120:
! 121: define(mcount_data_counter,1)
! 122:
! 123: dnl Called: mcount_movl_GOT_ebx
! 124: dnl Label H is "here", the %eip obtained from the call. C is the called
! 125: dnl subroutine. J is the jump across that subroutine. A fetch and "ret"
! 126: dnl is always done so calls and returns are balanced for the benefit of the
! 127: dnl various x86s that have return stack branch prediction.
! 128: define(mcount_movl_GOT_ebx,
! 129: m4_assert_numargs(-1)
! 130: ` call L(mcount_movl_GOT_ebx_C`'mcount_movl_GOT_ebx_counter)
! 131: L(mcount_movl_GOT_ebx_H`'mcount_movl_GOT_ebx_counter):
! 132: jmp L(mcount_movl_GOT_ebx_J`'mcount_movl_GOT_ebx_counter)
! 133: L(mcount_movl_GOT_ebx_C`'mcount_movl_GOT_ebx_counter):
! 134: movl (%esp), %ebx
! 135: ret
! 136: L(mcount_movl_GOT_ebx_J`'mcount_movl_GOT_ebx_counter):
! 137: addl $_GLOBAL_OFFSET_TABLE_+[.-L(mcount_movl_GOT_ebx_H`'mcount_movl_GOT_ebx_counter)], %ebx
! 138: define(`mcount_movl_GOT_ebx_counter',incr(mcount_movl_GOT_ebx_counter))')
1.1 maekawa 139:
1.1.1.2 ! ohara 140: define(mcount_movl_GOT_ebx_counter,1)
1.1 maekawa 141:
142:
143: dnl --------------------------------------------------------------------------
144: dnl Various x86 macros.
145: dnl
146:
147:
148: dnl Usage: ALIGN_OFFSET(bytes,offset)
149: dnl
150: dnl Align to `offset' away from a multiple of `bytes'.
151: dnl
152: dnl This is useful for testing, for example align to something very strict
153: dnl and see what effect offsets from it have, "ALIGN_OFFSET(256,32)".
154: dnl
155: dnl Generally you wouldn't execute across the padding, but it's done with
156: dnl nop's so it'll work.
157:
158: define(ALIGN_OFFSET,
159: m4_assert_numargs(2)
160: `ALIGN($1)
161: forloop(`i',1,$2,` nop
162: ')')
163:
164:
165: dnl Usage: defframe(name,offset)
166: dnl
167: dnl Make a definition like the following with which to access a parameter
168: dnl or variable on the stack.
169: dnl
170: dnl define(name,`FRAME+offset(%esp)')
171: dnl
172: dnl Actually m4_empty_if_zero(FRAME+offset) is used, which will save one
173: dnl byte if FRAME+offset is zero, by putting (%esp) rather than 0(%esp).
174: dnl Use define(`defframe_empty_if_zero_disabled',1) if for some reason the
175: dnl zero offset is wanted.
176: dnl
177: dnl The new macro also gets a check that when it's used FRAME is actually
178: dnl defined, and that the final %esp offset isn't negative, which would
179: dnl mean an attempt to access something below the current %esp.
180: dnl
181: dnl deflit() is used rather than a plain define(), so the new macro won't
182: dnl delete any following parenthesized expression. name(%edi) will come
183: dnl out say as 16(%esp)(%edi). This isn't valid assembler and should
184: dnl provoke an error, which is better than silently giving just 16(%esp).
185: dnl
1.1.1.2 ! ohara 186: dnl See README for more on the suggested way to access the stack frame.
1.1 maekawa 187:
188: define(defframe,
189: m4_assert_numargs(2)
190: `deflit(`$1',
191: m4_assert_defined(`FRAME')
192: `defframe_check_notbelow(`$1',$2,FRAME)dnl
193: defframe_empty_if_zero(FRAME+($2))(%esp)')')
194:
195: dnl Called: defframe_empty_if_zero(expression)
196: define(defframe_empty_if_zero,
1.1.1.2 ! ohara 197: m4_assert_numargs(1)
1.1 maekawa 198: `ifelse(defframe_empty_if_zero_disabled,1,
199: `eval($1)',
200: `m4_empty_if_zero($1)')')
201:
202: dnl Called: defframe_check_notbelow(`name',offset,FRAME)
203: define(defframe_check_notbelow,
204: m4_assert_numargs(3)
205: `ifelse(eval(($3)+($2)<0),1,
206: `m4_error(`$1 at frame offset $2 used when FRAME is only $3 bytes
207: ')')')
208:
209:
210: dnl Usage: FRAME_pushl()
211: dnl FRAME_popl()
212: dnl FRAME_addl_esp(n)
213: dnl FRAME_subl_esp(n)
214: dnl
215: dnl Adjust FRAME appropriately for a pushl or popl, or for an addl or subl
216: dnl %esp of n bytes.
217: dnl
218: dnl Using these macros is completely optional. Sometimes it makes more
219: dnl sense to put explicit deflit(`FRAME',N) forms, especially when there's
220: dnl jumps and different sequences of FRAME values need to be used in
221: dnl different places.
222:
223: define(FRAME_pushl,
224: m4_assert_numargs(0)
225: m4_assert_defined(`FRAME')
226: `deflit(`FRAME',eval(FRAME+4))')
227:
228: define(FRAME_popl,
229: m4_assert_numargs(0)
230: m4_assert_defined(`FRAME')
231: `deflit(`FRAME',eval(FRAME-4))')
232:
233: define(FRAME_addl_esp,
234: m4_assert_numargs(1)
235: m4_assert_defined(`FRAME')
236: `deflit(`FRAME',eval(FRAME-($1)))')
237:
238: define(FRAME_subl_esp,
239: m4_assert_numargs(1)
240: m4_assert_defined(`FRAME')
241: `deflit(`FRAME',eval(FRAME+($1)))')
242:
243:
244: dnl Usage: defframe_pushl(name)
245: dnl
1.1.1.2 ! ohara 246: dnl Do a combination FRAME_pushl() and a defframe() to name the stack
1.1 maekawa 247: dnl location just pushed. This should come after a pushl instruction.
248: dnl Putting it on the same line works and avoids lengthening the code. For
249: dnl example,
250: dnl
251: dnl pushl %eax defframe_pushl(VAR_COUNTER)
252: dnl
253: dnl Notice the defframe() is done with an unquoted -FRAME thus giving its
254: dnl current value without tracking future changes.
255:
256: define(defframe_pushl,
1.1.1.2 ! ohara 257: m4_assert_numargs(1)
1.1 maekawa 258: `FRAME_pushl()defframe(`$1',-FRAME)')
259:
260:
261: dnl --------------------------------------------------------------------------
262: dnl Assembler instruction macros.
263: dnl
264:
265:
266: dnl Usage: emms_or_femms
267: dnl femms_available_p
268: dnl
269: dnl femms_available_p expands to 1 or 0 according to whether the AMD 3DNow
270: dnl femms instruction is available. emms_or_femms expands to femms if
271: dnl available, or emms if not.
272: dnl
273: dnl emms_or_femms is meant for use in the K6 directory where plain K6
274: dnl (without femms) and K6-2 and K6-3 (with a slightly faster femms) are
275: dnl supported together.
276: dnl
277: dnl On K7 femms is no longer faster and is just an alias for emms, so plain
278: dnl emms may as well be used.
279:
280: define(femms_available_p,
281: m4_assert_numargs(-1)
282: `m4_ifdef_anyof_p(
1.1.1.2 ! ohara 283: `HAVE_HOST_CPU_k62',
! 284: `HAVE_HOST_CPU_k63',
! 285: `HAVE_HOST_CPU_athlon')')
1.1 maekawa 286:
287: define(emms_or_femms,
288: m4_assert_numargs(-1)
289: `ifelse(femms_available_p,1,`femms',`emms')')
290:
291:
292: dnl Usage: femms
293: dnl
1.1.1.2 ! ohara 294: dnl Gas 2.9.1 which comes with FreeBSD 3.4 doesn't support femms, so the
1.1 maekawa 295: dnl following is a replacement using .byte.
296: dnl
297: dnl If femms isn't available, an emms is generated instead, for convenience
298: dnl when testing on a machine without femms.
299:
300: define(femms,
301: m4_assert_numargs(-1)
302: `ifelse(femms_available_p,1,
303: `.byte 15,14 C AMD 3DNow femms',
304: `emms`'dnl
305: m4_warning(`warning, using emms in place of femms, use for testing only
306: ')')')
307:
308:
309: dnl Usage: jadcl0(op)
310: dnl
1.1.1.2 ! ohara 311: dnl Generate a jnc/incl as a substitute for adcl $0,op. Note this isn't an
! 312: dnl exact replacement, since it doesn't set the flags like adcl does.
1.1 maekawa 313: dnl
314: dnl This finds a use in K6 mpn_addmul_1, mpn_submul_1, mpn_mul_basecase and
315: dnl mpn_sqr_basecase because on K6 an adcl is slow, the branch
316: dnl misprediction penalty is small, and the multiply algorithm used leads
317: dnl to a carry bit on average only 1/4 of the time.
318: dnl
1.1.1.2 ! ohara 319: dnl jadcl0_disabled can be set to 1 to instead generate an ordinary adcl
! 320: dnl for comparison. For example,
1.1 maekawa 321: dnl
322: dnl define(`jadcl0_disabled',1)
323: dnl
324: dnl When using a register operand, eg. "jadcl0(%edx)", the jnc/incl code is
325: dnl the same size as an adcl. This makes it possible to use the exact same
1.1.1.2 ! ohara 326: dnl computed jump code when testing the relative speed of the two.
1.1 maekawa 327:
328: define(jadcl0,
329: m4_assert_numargs(1)
330: `ifelse(jadcl0_disabled,1,
331: `adcl $`'0, $1',
1.1.1.2 ! ohara 332: `jnc L(jadcl0_`'jadcl0_counter)
1.1 maekawa 333: incl $1
1.1.1.2 ! ohara 334: L(jadcl0_`'jadcl0_counter):
! 335: define(`jadcl0_counter',incr(jadcl0_counter))')')
! 336:
! 337: define(jadcl0_counter,1)
1.1 maekawa 338:
339:
340: dnl Usage: cmov_available_p
341: dnl
342: dnl Expand to 1 if cmov is available, 0 if not.
343:
344: define(cmov_available_p,
1.1.1.2 ! ohara 345: m4_assert_numargs(-1)
1.1 maekawa 346: `m4_ifdef_anyof_p(
1.1.1.2 ! ohara 347: `HAVE_HOST_CPU_pentiumpro',
! 348: `HAVE_HOST_CPU_pentium2',
! 349: `HAVE_HOST_CPU_pentium3',
! 350: `HAVE_HOST_CPU_pentium4',
! 351: `HAVE_HOST_CPU_athlon')')
1.1 maekawa 352:
353:
354: dnl Usage: x86_lookup(target, key,value, key,value, ...)
355: dnl x86_lookup_p(target, key,value, key,value, ...)
356: dnl
357: dnl Look for `target' among the `key' parameters.
358: dnl
359: dnl x86_lookup expands to the corresponding `value', or generates an error
360: dnl if `target' isn't found.
361: dnl
362: dnl x86_lookup_p expands to 1 if `target' is found, or 0 if not.
363:
364: define(x86_lookup,
1.1.1.2 ! ohara 365: m4_assert_numargs_range(1,999)
1.1 maekawa 366: `ifelse(eval($#<3),1,
367: `m4_error(`unrecognised part of x86 instruction: $1
368: ')',
369: `ifelse(`$1',`$2', `$3',
370: `x86_lookup(`$1',shift(shift(shift($@))))')')')
371:
372: define(x86_lookup_p,
1.1.1.2 ! ohara 373: m4_assert_numargs_range(1,999)
1.1 maekawa 374: `ifelse(eval($#<3),1, `0',
375: `ifelse(`$1',`$2', `1',
376: `x86_lookup_p(`$1',shift(shift(shift($@))))')')')
377:
378:
379: dnl Usage: x86_opcode_reg32(reg)
380: dnl x86_opcode_reg32_p(reg)
381: dnl
382: dnl x86_opcode_reg32 expands to the standard 3 bit encoding for the given
383: dnl 32-bit register, eg. `%ebp' turns into 5.
384: dnl
385: dnl x86_opcode_reg32_p expands to 1 if reg is a valid 32-bit register, or 0
386: dnl if not.
387:
388: define(x86_opcode_reg32,
389: m4_assert_numargs(1)
390: `x86_lookup(`$1',x86_opcode_reg32_list)')
391:
392: define(x86_opcode_reg32_p,
393: m4_assert_onearg()
394: `x86_lookup_p(`$1',x86_opcode_reg32_list)')
395:
396: define(x86_opcode_reg32_list,
397: ``%eax',0,
398: `%ecx',1,
399: `%edx',2,
400: `%ebx',3,
401: `%esp',4,
402: `%ebp',5,
403: `%esi',6,
404: `%edi',7')
405:
406:
407: dnl Usage: x86_opcode_tttn(cond)
408: dnl
409: dnl Expand to the 4-bit "tttn" field value for the given x86 branch
410: dnl condition (like `c', `ae', etc), or give an error if unrecognised.
411:
412: define(x86_opcode_tttn,
413: m4_assert_numargs(1)
414: `x86_lookup(`$1',x86_opcode_tttn_list)')
415:
416: define(x86_opcode_tttn_list,
417: ``o', 0,
418: `no', 1,
419: `b', 2, `c', 2, `nae',2,
420: `nb', 3, `nc', 3, `ae', 3,
421: `e', 4, `z', 4,
422: `ne', 5, `nz', 5,
423: `be', 6, `na', 6,
424: `nbe', 7, `a', 7,
425: `s', 8,
426: `ns', 9,
427: `p', 10, `pe', 10, `npo',10,
428: `np', 11, `npe',11, `po', 11,
429: `l', 12, `nge',12,
430: `nl', 13, `ge', 13,
431: `le', 14, `ng', 14,
432: `nle',15, `g', 15')
433:
434:
435: dnl Usage: cmovCC(srcreg,dstreg)
436: dnl
1.1.1.2 ! ohara 437: dnl Generate a cmov instruction if the host supports cmov, or simulate it
1.1 maekawa 438: dnl with a conditional jump if not (the latter being meant only for
439: dnl testing). For example,
440: dnl
441: dnl cmovz( %eax, %ebx)
442: dnl
443: dnl cmov instructions are generated using .byte sequences, since only
444: dnl recent versions of gas know cmov.
445: dnl
446: dnl The source operand can only be a plain register. (m4 code implementing
447: dnl full memory addressing modes exists, believe it or not, but isn't
448: dnl currently needed and isn't included.)
449: dnl
450: dnl All the standard conditions are defined. Attempting to use one without
451: dnl the macro parentheses, such as just "cmovbe %eax, %ebx", will provoke
452: dnl an error. This ensures the necessary .byte sequences aren't
453: dnl accidentally missed.
454:
455: dnl Called: define_cmov_many(cond,tttn,cond,tttn,...)
456: define(define_cmov_many,
457: `ifelse(m4_length(`$1'),0,,
458: `define_cmov(`$1',`$2')define_cmov_many(shift(shift($@)))')')
459:
460: dnl Called: define_cmov(cond,tttn)
461: define(define_cmov,
462: m4_assert_numargs(2)
463: `define(`cmov$1',
464: m4_instruction_wrapper()
465: m4_assert_numargs(2)
466: `cmov_internal'(m4_doublequote($`'0),``$1',`$2'',dnl
467: m4_doublequote($`'1),m4_doublequote($`'2)))')
468:
469: define_cmov_many(x86_opcode_tttn_list)
470:
471:
472: dnl Called: cmov_internal(name,cond,tttn,src,dst)
473: define(cmov_internal,
474: m4_assert_numargs(5)
475: `ifelse(cmov_available_p,1,
476: `cmov_bytes_tttn(`$1',`$3',`$4',`$5')',
477: `m4_warning(`warning, simulating cmov with jump, use for testing only
478: ')cmov_simulate(`$2',`$4',`$5')')')
479:
480: dnl Called: cmov_simulate(cond,src,dst)
481: dnl If this is going to be used with memory operands for the source it will
482: dnl need to be changed to do a fetch even if the condition is false, so as
483: dnl to trigger exceptions the same way a real cmov does.
484: define(cmov_simulate,
485: m4_assert_numargs(3)
1.1.1.2 ! ohara 486: `j$1 L(cmov_T`'cmov_counter) C cmov$1 $2, $3
! 487: jmp L(cmov_F`'cmov_counter)
! 488: L(cmov_T`'cmov_counter):
! 489: movl $2, $3
! 490: L(cmov_F`'cmov_counter):
! 491: define(`cmov_counter',incr(cmov_counter))')
! 492:
! 493: define(cmov_counter,1)
1.1 maekawa 494:
495: dnl Called: cmov_bytes_tttn(name,tttn,src,dst)
496: define(cmov_bytes_tttn,
497: m4_assert_numargs(4)
498: `.byte dnl
499: 15, dnl
500: eval(64+$2), dnl
501: eval(192+8*x86_opcode_reg32(`$4')+x86_opcode_reg32(`$3')) dnl
502: C `$1 $3, $4'')
503:
504:
1.1.1.2 ! ohara 505: dnl Usage: x86_opcode_regmmx(reg)
! 506: dnl
! 507: dnl Validate the given mmx register, and return its number, 0 to 7.
! 508:
! 509: define(x86_opcode_regmmx,
! 510: m4_assert_numargs(1)
! 511: `x86_lookup(`$1',x86_opcode_regmmx_list)')
! 512:
! 513: define(x86_opcode_regmmx_list,
! 514: ``%mm0',0,
! 515: `%mm1',1,
! 516: `%mm2',2,
! 517: `%mm3',3,
! 518: `%mm4',4,
! 519: `%mm5',5,
! 520: `%mm6',6,
! 521: `%mm7',7')
! 522:
! 523:
! 524: dnl Usage: psadbw(src,dst)
! 525: dnl
! 526: dnl Only recent versions of gas know psadbw, in particular gas 2.9.1 on
! 527: dnl FreeBSD 3.3 and 3.4 doesn't recognise it, so instead emit .byte
! 528: dnl sequences.
! 529: dnl
! 530: dnl Only register->register forms are supported here, which suffices for
! 531: dnl the current code.
! 532:
! 533: define(psadbw,
! 534: m4_instruction_wrapper()
! 535: m4_assert_numargs(2)
! 536: `ifelse(psadbw_available_p,1,
! 537: `psadbw_bytes(`$1',`$2')',
! 538: `psadbw_simulate(`$1',`$2')')')
! 539:
! 540: define(psadbw_available_p,
! 541: m4_assert_numargs(-1)
! 542: `m4_ifdef_anyof_p(`HAVE_HOST_CPU_pentium3',
! 543: `HAVE_HOST_CPU_pentium4',
! 544: `HAVE_HOST_CPU_athlon')')
! 545:
! 546: dnl Called: psadbw_bytes(src,dst)
! 547: define(psadbw_bytes,
! 548: m4_assert_numargs(2)
! 549: `.byte 0x0f,0xf6,dnl
! 550: eval(192+x86_opcode_regmmx(`$2')*8+x86_opcode_regmmx(`$1')) dnl
! 551: C `psadbw $1, $2'')
! 552:
! 553: dnl Called: psadbw_simulate(src,dst)
! 554: define(psadbw_simulate,
! 555: m4_assert_numargs(2)
! 556: `m4_warning(`warning, using simulated and only partly functional psadbw, use testing only
! 557: ') C This works enough for the sum of bytes done in some of the popcounts,
! 558: C but is otherwise a long way short of correct.
! 559: pushl %eax
! 560: pushl %edx
! 561: pushf
! 562: subl $`'8, %esp
! 563: movq $2, (%esp)
! 564: movzbl (%esp), %eax
! 565: forloop(i,1,7,
! 566: ` movzbl i`'(%esp), %edx
! 567: addl %edx, %eax
! 568: ')
! 569: movd %eax, $2
! 570: addl $`'8, %esp
! 571: popf
! 572: popl %edx
! 573: popl %eax
! 574: ')
! 575:
! 576:
1.1 maekawa 577: dnl Usage: loop_or_decljnz label
578: dnl
579: dnl Generate either a "loop" instruction or a "decl %ecx / jnz", whichever
580: dnl is better. "loop" is better on K6 and probably on 386, on other chips
581: dnl separate decl/jnz is better.
582: dnl
583: dnl This macro is just for mpn/x86/divrem_1.asm and mpn/x86/mod_1.asm where
584: dnl this loop_or_decljnz variation is enough to let the code be shared by
585: dnl all chips.
586:
587: define(loop_or_decljnz,
1.1.1.2 ! ohara 588: m4_assert_numargs(-1)
1.1 maekawa 589: `ifelse(loop_is_better_p,1,
590: `loop',
591: `decl %ecx
592: jnz')')
593:
594: define(loop_is_better_p,
1.1.1.2 ! ohara 595: m4_assert_numargs(-1)
! 596: `m4_ifdef_anyof_p(`HAVE_HOST_CPU_k6',
! 597: `HAVE_HOST_CPU_k62',
! 598: `HAVE_HOST_CPU_k63',
! 599: `HAVE_HOST_CPU_i386')')
1.1 maekawa 600:
601:
602: dnl Usage: Zdisp(inst,op,op,op)
603: dnl
604: dnl Generate explicit .byte sequences if necessary to force a byte-sized
605: dnl zero displacement on an instruction. For example,
606: dnl
607: dnl Zdisp( movl, 0,(%esi), %eax)
608: dnl
609: dnl expands to
610: dnl
611: dnl .byte 139,70,0 C movl 0(%esi), %eax
612: dnl
613: dnl If the displacement given isn't 0, then normal assembler code is
614: dnl generated. For example,
615: dnl
616: dnl Zdisp( movl, 4,(%esi), %eax)
617: dnl
618: dnl expands to
619: dnl
620: dnl movl 4(%esi), %eax
621: dnl
622: dnl This means a single Zdisp() form can be used with an expression for the
623: dnl displacement, and .byte will be used only if necessary. The
624: dnl displacement argument is eval()ed.
625: dnl
626: dnl Because there aren't many places a 0(reg) form is wanted, Zdisp is
627: dnl implemented with a table of instructions and encodings. A new entry is
1.1.1.2 ! ohara 628: dnl needed for any different operation or registers. The table is split
! 629: dnl into separate macros to avoid overflowing BSD m4 macro expansion space.
1.1 maekawa 630:
631: define(Zdisp,
1.1.1.2 ! ohara 632: m4_assert_numargs(4)
1.1 maekawa 633: `define(`Zdisp_found',0)dnl
1.1.1.2 ! ohara 634: Zdisp_1($@)dnl
! 635: Zdisp_2($@)dnl
! 636: Zdisp_3($@)dnl
! 637: Zdisp_4($@)dnl
! 638: ifelse(Zdisp_found,0,
! 639: `m4_error(`unrecognised instruction in Zdisp: $1 $2 $3 $4
! 640: ')')')
! 641:
! 642: define(Zdisp_1,`dnl
! 643: Zdisp_match( adcl, 0,(%edx), %eax, `0x13,0x42,0x00', $@)`'dnl
! 644: Zdisp_match( adcl, 0,(%edx), %ebx, `0x13,0x5a,0x00', $@)`'dnl
! 645: Zdisp_match( adcl, 0,(%edx), %esi, `0x13,0x72,0x00', $@)`'dnl
! 646: Zdisp_match( addl, %ebx, 0,(%edi), `0x01,0x5f,0x00', $@)`'dnl
! 647: Zdisp_match( addl, %ecx, 0,(%edi), `0x01,0x4f,0x00', $@)`'dnl
! 648: Zdisp_match( addl, %esi, 0,(%edi), `0x01,0x77,0x00', $@)`'dnl
! 649: Zdisp_match( sbbl, 0,(%edx), %eax, `0x1b,0x42,0x00', $@)`'dnl
! 650: Zdisp_match( sbbl, 0,(%edx), %esi, `0x1b,0x72,0x00', $@)`'dnl
! 651: Zdisp_match( subl, %ecx, 0,(%edi), `0x29,0x4f,0x00', $@)`'dnl
! 652: Zdisp_match( movzbl, 0,(%eax,%ebp), %eax, `0x0f,0xb6,0x44,0x28,0x00', $@)`'dnl
! 653: Zdisp_match( movzbl, 0,(%ecx,%edi), %edi, `0x0f,0xb6,0x7c,0x39,0x00', $@)`'dnl
! 654: ')
! 655: define(Zdisp_2,`dnl
! 656: Zdisp_match( movl, %eax, 0,(%edi), `0x89,0x47,0x00', $@)`'dnl
! 657: Zdisp_match( movl, %ebx, 0,(%edi), `0x89,0x5f,0x00', $@)`'dnl
! 658: Zdisp_match( movl, %esi, 0,(%edi), `0x89,0x77,0x00', $@)`'dnl
! 659: Zdisp_match( movl, 0,(%ebx), %eax, `0x8b,0x43,0x00', $@)`'dnl
! 660: Zdisp_match( movl, 0,(%ebx), %esi, `0x8b,0x73,0x00', $@)`'dnl
! 661: Zdisp_match( movl, 0,(%edx), %eax, `0x8b,0x42,0x00', $@)`'dnl
! 662: Zdisp_match( movl, 0,(%esi), %eax, `0x8b,0x46,0x00', $@)`'dnl
1.1 maekawa 663: Zdisp_match( movl, 0,(%esi,%ecx,4), %eax, `0x8b,0x44,0x8e,0x00', $@)`'dnl
1.1.1.2 ! ohara 664: ')
! 665: define(Zdisp_3,`dnl
1.1 maekawa 666: Zdisp_match( movq, 0,(%eax,%ecx,8), %mm0, `0x0f,0x6f,0x44,0xc8,0x00', $@)`'dnl
667: Zdisp_match( movq, 0,(%ebx,%eax,4), %mm0, `0x0f,0x6f,0x44,0x83,0x00', $@)`'dnl
668: Zdisp_match( movq, 0,(%ebx,%eax,4), %mm2, `0x0f,0x6f,0x54,0x83,0x00', $@)`'dnl
1.1.1.2 ! ohara 669: Zdisp_match( movq, 0,(%ebx,%ecx,4), %mm0, `0x0f,0x6f,0x44,0x8b,0x00', $@)`'dnl
! 670: Zdisp_match( movq, 0,(%edx), %mm0, `0x0f,0x6f,0x42,0x00', $@)`'dnl
! 671: Zdisp_match( movq, 0,(%esi), %mm0, `0x0f,0x6f,0x46,0x00', $@)`'dnl
! 672: Zdisp_match( movq, %mm0, 0,(%edi), `0x0f,0x7f,0x47,0x00', $@)`'dnl
1.1 maekawa 673: Zdisp_match( movq, %mm2, 0,(%ecx,%eax,4), `0x0f,0x7f,0x54,0x81,0x00', $@)`'dnl
674: Zdisp_match( movq, %mm2, 0,(%edx,%eax,4), `0x0f,0x7f,0x54,0x82,0x00', $@)`'dnl
675: Zdisp_match( movq, %mm0, 0,(%edx,%ecx,8), `0x0f,0x7f,0x44,0xca,0x00', $@)`'dnl
1.1.1.2 ! ohara 676: ')
! 677: define(Zdisp_4,`dnl
! 678: Zdisp_match( movd, 0,(%eax,%ecx,4), %mm0, `0x0f,0x6e,0x44,0x88,0x00', $@)`'dnl
1.1 maekawa 679: Zdisp_match( movd, 0,(%eax,%ecx,8), %mm1, `0x0f,0x6e,0x4c,0xc8,0x00', $@)`'dnl
680: Zdisp_match( movd, 0,(%edx,%ecx,8), %mm0, `0x0f,0x6e,0x44,0xca,0x00', $@)`'dnl
681: Zdisp_match( movd, %mm0, 0,(%eax,%ecx,4), `0x0f,0x7e,0x44,0x88,0x00', $@)`'dnl
682: Zdisp_match( movd, %mm0, 0,(%ecx,%eax,4), `0x0f,0x7e,0x44,0x81,0x00', $@)`'dnl
683: Zdisp_match( movd, %mm2, 0,(%ecx,%eax,4), `0x0f,0x7e,0x54,0x81,0x00', $@)`'dnl
1.1.1.2 ! ohara 684: Zdisp_match( movd, %mm0, 0,(%edx,%ecx,4), `0x0f,0x7e,0x44,0x8a,0x00', $@)`'dnl
! 685: ')
1.1 maekawa 686:
687: define(Zdisp_match,
1.1.1.2 ! ohara 688: m4_assert_numargs(9)
1.1 maekawa 689: `ifelse(eval(m4_stringequal_p(`$1',`$6')
690: && m4_stringequal_p(`$2',0)
691: && m4_stringequal_p(`$3',`$8')
692: && m4_stringequal_p(`$4',`$9')),1,
693: `define(`Zdisp_found',1)dnl
694: ifelse(eval(`$7'),0,
695: ` .byte $5 C `$1 0$3, $4'',
696: ` $6 $7$8, $9')',
697:
698: `ifelse(eval(m4_stringequal_p(`$1',`$6')
699: && m4_stringequal_p(`$2',`$7')
700: && m4_stringequal_p(`$3',0)
701: && m4_stringequal_p(`$4',`$9')),1,
702: `define(`Zdisp_found',1)dnl
703: ifelse(eval(`$8'),0,
704: ` .byte $5 C `$1 $2, 0$4'',
705: ` $6 $7, $8$9')')')')
706:
707:
708: dnl Usage: shldl(count,src,dst)
709: dnl shrdl(count,src,dst)
710: dnl shldw(count,src,dst)
711: dnl shrdw(count,src,dst)
712: dnl
713: dnl Generate a double-shift instruction, possibly omitting a %cl count
714: dnl parameter if that's what the assembler requires, as indicated by
715: dnl WANT_SHLDL_CL in config.m4. For example,
716: dnl
717: dnl shldl( %cl, %eax, %ebx)
718: dnl
719: dnl turns into either
720: dnl
721: dnl shldl %cl, %eax, %ebx
722: dnl or
723: dnl shldl %eax, %ebx
724: dnl
725: dnl Immediate counts are always passed through unchanged. For example,
726: dnl
727: dnl shrdl( $2, %esi, %edi)
728: dnl becomes
729: dnl shrdl $2, %esi, %edi
730: dnl
731: dnl
732: dnl If you forget to use the macro form "shldl( ...)" and instead write
733: dnl just a plain "shldl ...", an error results. This ensures the necessary
734: dnl variant treatment of %cl isn't accidentally bypassed.
735:
736: define(define_shd_instruction,
1.1.1.2 ! ohara 737: m4_assert_numargs(1)
1.1 maekawa 738: `define($1,
739: m4_instruction_wrapper()
740: m4_assert_numargs(3)
741: `shd_instruction'(m4_doublequote($`'0),m4_doublequote($`'1),dnl
742: m4_doublequote($`'2),m4_doublequote($`'3)))')
743:
744: dnl Effectively: define(shldl,`shd_instruction(`$0',`$1',`$2',`$3')') etc
745: define_shd_instruction(shldl)
746: define_shd_instruction(shrdl)
747: define_shd_instruction(shldw)
748: define_shd_instruction(shrdw)
749:
750: dnl Called: shd_instruction(op,count,src,dst)
751: define(shd_instruction,
752: m4_assert_numargs(4)
753: m4_assert_defined(`WANT_SHLDL_CL')
754: `ifelse(eval(m4_stringequal_p(`$2',`%cl') && !WANT_SHLDL_CL),1,
755: ``$1' `$3', `$4'',
756: ``$1' `$2', `$3', `$4'')')
757:
758:
1.1.1.2 ! ohara 759: dnl Usage: ASSERT([cond][,instructions])
1.1 maekawa 760: dnl
761: dnl If WANT_ASSERT is 1, output the given instructions and expect the given
762: dnl flags condition to then be satisfied. For example,
763: dnl
764: dnl ASSERT(ne, `cmpl %eax, %ebx')
765: dnl
766: dnl The instructions can be omitted to just assert a flags condition with
767: dnl no extra calculation. For example,
768: dnl
769: dnl ASSERT(nc)
770: dnl
771: dnl When `instructions' is not empty, a pushf/popf is added to preserve the
772: dnl flags, but the instructions themselves must preserve any registers that
773: dnl matter. FRAME is adjusted for the push and pop, so the instructions
774: dnl given can use defframe() stack variables.
1.1.1.2 ! ohara 775: dnl
! 776: dnl The condition can be omitted to just output the given instructions when
! 777: dnl assertion checking is wanted. In this case the pushf/popf is omitted.
! 778: dnl For example,
! 779: dnl
! 780: dnl ASSERT(, `movl %eax, VAR_KEEPVAL')
1.1 maekawa 781:
782: define(ASSERT,
783: m4_assert_numargs_range(1,2)
784: `ifelse(WANT_ASSERT,1,
1.1.1.2 ! ohara 785: `ifelse(`$1',,
! 786: `$2',
1.1 maekawa 787: `C ASSERT
788: ifelse(`$2',,,` pushf ifdef(`FRAME',`FRAME_pushl()')')
789: $2
1.1.1.2 ! ohara 790: j`$1' L(ASSERT_ok`'ASSERT_counter)
1.1 maekawa 791: ud2 C assertion failed
1.1.1.2 ! ohara 792: L(ASSERT_ok`'ASSERT_counter):
1.1 maekawa 793: ifelse(`$2',,,` popf ifdef(`FRAME',`FRAME_popl()')')
1.1.1.2 ! ohara 794: define(`ASSERT_counter',incr(ASSERT_counter))')')')
! 795:
! 796: define(ASSERT_counter,1)
1.1 maekawa 797:
798:
799: dnl Usage: movl_text_address(label,register)
800: dnl
801: dnl Get the address of a text segment label, using either a plain movl or a
802: dnl position-independent calculation, as necessary. For example,
803: dnl
804: dnl movl_text_address(L(foo),%eax)
805: dnl
806: dnl This macro is only meant for use in ASSERT()s or when testing, since
807: dnl the PIC sequence it generates will want to be done with a ret balancing
808: dnl the call on CPUs with return address branch prediction.
809: dnl
1.1.1.2 ! ohara 810: dnl The addl generated here has a backward reference to the label, and so
! 811: dnl won't suffer from the two forwards references bug in old gas (described
! 812: dnl in mpn/x86/README).
1.1 maekawa 813:
814: define(movl_text_address,
1.1.1.2 ! ohara 815: m4_assert_numargs(2)
1.1 maekawa 816: `ifdef(`PIC',
1.1.1.2 ! ohara 817: `call L(movl_text_address_`'movl_text_address_counter)
! 818: L(movl_text_address_`'movl_text_address_counter):
! 819: popl $2 C %eip
! 820: addl `$'$1-L(movl_text_address_`'movl_text_address_counter), $2
! 821: define(`movl_text_address_counter',incr(movl_text_address_counter))',
1.1 maekawa 822: `movl `$'$1, $2')')
1.1.1.2 ! ohara 823:
! 824: define(movl_text_address_counter,1)
! 825:
! 826:
! 827: dnl Usage: notl_or_xorl_GMP_NUMB_MASK(reg)
! 828: dnl
! 829: dnl Expand to either "notl `reg'" or "xorl $GMP_NUMB_MASK,`reg'" as
! 830: dnl appropriate for nails in use or not.
! 831:
! 832: define(notl_or_xorl_GMP_NUMB_MASK,
! 833: m4_assert_numargs(1)
! 834: `ifelse(GMP_NAIL_BITS,0,
! 835: `notl `$1'',
! 836: `xorl $GMP_NUMB_MASK, `$1'')')
1.1 maekawa 837:
838:
839: divert`'dnl
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>