[BACK]Return to submul_1.asm CVS log [TXT][DIR] Up to [local] / OpenXM_contrib / gmp / mpn / alpha / ev6

Annotation of OpenXM_contrib/gmp/mpn/alpha/ev6/submul_1.asm, Revision 1.1.1.1

1.1       ohara       1: dnl Alpha ev6 mpn_submul_1 -- Multiply a limb vector with a limb and subtract
                      2: dnl the result from a second limb vector.
                      3:
                      4: dnl  Copyright 2000 Free Software Foundation, Inc.
                      5:
                      6: dnl  This file is part of the GNU MP Library.
                      7:
                      8: dnl  The GNU MP Library is free software; you can redistribute it and/or modify
                      9: dnl  it under the terms of the GNU Lesser General Public License as published
                     10: dnl  by the Free Software Foundation; either version 2.1 of the License, or (at
                     11: dnl  your option) any later version.
                     12:
                     13: dnl  The GNU MP Library is distributed in the hope that it will be useful, but
                     14: dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
                     15: dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
                     16: dnl  License for more details.
                     17:
                     18: dnl  You should have received a copy of the GNU Lesser General Public License
                     19: dnl  along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
                     20: dnl  the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
                     21: dnl  MA 02111-1307, USA.
                     22:
                     23: include(`../config.m4')
                     24:
                     25: dnl  INPUT PARAMETERS
                     26: dnl  res_ptr   r16
                     27: dnl  s1_ptr    r17
                     28: dnl  size      r18
                     29: dnl  s2_limb   r19
                     30:
                     31: dnl  This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and
                     32: dnl  exactly 3.5 cycles/limb on EV6...
                     33:
                     34: dnl This code was written in close cooperation with ev6 pipeline expert
                     35: dnl Steve Root.  Any errors are tege's fault, though.
                     36: dnl
                     37: dnl   Register usages for unrolled loop:
                     38: dnl      0-3     mul's
                     39: dnl      4-7     acc's
                     40: dnl      8-15    mul results
                     41: dnl      20,21   carry's
                     42: dnl      22,23,28 save for stores
                     43:
                     44: dnl   Sustains 8 mul-adds in 28 cycles in the unrolled inner loop.
                     45:
                     46: dnl   The stores can issue a cycle late so we have paired no-op's to 'catch'
                     47: dnl   them, so that further disturbance to the schedule is damped.
                     48:
                     49: dnl   We couldn't pair the loads, because the entangled schedule of the
                     50: dnl   carry's has to happen on one side {0} of the machine. Note, the total
                     51: dnl   use of U0, and the total use of L0 (after attending to the stores),
                     52: dnl   which is part of the reason why....
                     53:
                     54: dnl   This is a great schedule for the d_cache, a poor schedule for the
                     55: dnl   b_cache. The lockup on U0 means that any stall can't be recovered
                     56: dnl   from. Consider a ldq in L1.  say that load gets stalled because it
                     57: dnl   collides with a fill from the b_Cache. On the next cycle, this load
                     58: dnl   gets priority. It first looks at L0, and goes there. The instruction
                     59: dnl   we intended for L0 gets to look at L1, which is NOT where we want
                     60: dnl   it. It either stalls 1, because it can't go in L0, or goes there, and
                     61: dnl   causes a further instruction to stall.
                     62:
                     63: dnl   So for b_cache, we're likely going to want to put one or more cycles
                     64: dnl   back into the code! And, of course, put in prefetches. For the
                     65: dnl   accumulator, lds, intent to modify.  For the multiplier, you might
                     66: dnl   want ldq, evict next, if you're not wanting to use it again soon. Use
                     67: dnl   256 ahead of present pointer value. At a place where we have an mt
                     68: dnl   followed by a bookkeeping, put the bookkeeping in upper, and the
                     69: dnl   prefetch into lower.
                     70:
                     71: dnl   Note, the usage of physical registers per cycle is smoothed off, as
                     72: dnl   much as possible.
                     73:
                     74: dnl   Note, the ldq's and stq's are at the end of the quadpacks.  Note, we'd
                     75: dnl   like not to have a ldq or stq precede a conditional branch in a
                     76: dnl   quadpack. The conditional branch moves the retire pointer one cycle
                     77: dnl   later.
                     78:
                     79: dnl   Optimization notes:
                     80: dnl   Callee-saves regs: r9 r10 r11 r12 r13 r14 r15 r26 ?r27?
                     81: dnl   Reserved regs:    r29 r30 r31
                     82: dnl   Free caller-saves regs in unrolled code: r24 r25 (r28 is in use as a store temporary)
                     83: dnl   We should swap some of the callee-saves regs for some of the free
                     84: dnl   caller-saves regs, saving some overhead cycles.
                     85: dnl   Most importantly, we should write fast code for the 0-7 case.
                     86: dnl   The code we use there is for the 21164, and runs at 7 cycles/limb
                     87: dnl   on the 21264.  Should not be hard, if we write specialized code for
                     88: dnl   1-7 limbs (the one for 0 limbs should be straightforward).  We then just
                     89: dnl   need a jump table indexed by the low 3 bits of the count argument.
                     90:
                     91:
                     92: ASM_START()
                     93: PROLOGUE(mpn_submul_1)         C res_ptr[] -= s1_ptr[] * s2_limb over size limbs; r0 holds cy_limb at return
                     94:        cmpult  r18,    8,      r1      C r1 = (size < 8)
                     95:        beq     r1,     $Large          C size >= 8: use the unrolled code
                     96:
                     97:        ldq     r2,     0(r17)          C r2 = s1_limb
                     98:        addq    r17,    8,      r17     C s1_ptr++
                     99:        subq    r18,    1,      r18     C size--
                    100:        mulq    r2,     r19,    r3      C r3 = prod_low
                    101:        ldq     r5,     0(r16)          C r5 = *res_ptr
                    102:        umulh   r2,     r19,    r0      C r0 = prod_high
                    103:        beq     r18,    $Lend0b         C jump if size was == 1
                    104:        ldq     r2,     0(r17)          C r2 = s1_limb
                    105:        addq    r17,    8,      r17     C s1_ptr++
                    106:        subq    r18,    1,      r18     C size--
                    107:        subq    r5,     r3,     r3      C r3 = *res_ptr - prod_low
                    108:        cmpult  r5,     r3,     r4      C r4 = borrow from that subtract
                    109:        stq     r3,     0(r16)          C store result limb
                    110:        addq    r16,    8,      r16     C res_ptr++
                    111:        beq     r18,    $Lend0a         C jump if size was == 2
                    112:
                    113:        ALIGN(8)
                    114: $Loop0:        mulq    r2,     r19,    r3      C r3 = prod_low
                    115:        ldq     r5,     0(r16)          C r5 = *res_ptr
                    116:        addq    r4,     r0,     r0      C cy_limb = cy_limb + 'cy'
                    117:        subq    r18,    1,      r18     C size--
                    118:        umulh   r2,     r19,    r4      C r4 = prod_high (added into cy_limb on the next pass)
                    119:        ldq     r2,     0(r17)          C r2 = s1_limb
                    120:        addq    r17,    8,      r17     C s1_ptr++
                    121:        addq    r3,     r0,     r3      C r3 = cy_limb + prod_low
                    122:        cmpult  r3,     r0,     r0      C r0 = carry from (cy_limb + prod_low)
                    123:        subq    r5,     r3,     r3      C r3 = *res_ptr - (cy_limb + prod_low)
                    124:        cmpult  r5,     r3,     r5      C r5 = borrow from that subtract
                    125:        stq     r3,     0(r16)          C store result limb
                    126:        addq    r16,    8,      r16     C res_ptr++
                    127:        addq    r5,     r0,     r0      C combine carries
                    128:        bne     r18,    $Loop0          C loop while size != 0
                    129: $Lend0a:
                    130:        mulq    r2,     r19,    r3      C r3 = prod_low
                    131:        ldq     r5,     0(r16)          C r5 = *res_ptr
                    132:        addq    r4,     r0,     r0      C cy_limb = cy_limb + 'cy'
                    133:        umulh   r2,     r19,    r4      C r4 = prod_high of the last limb
                    134:        addq    r3,     r0,     r3      C r3 = cy_limb + prod_low
                    135:        cmpult  r3,     r0,     r0      C r0 = carry from (cy_limb + prod_low)
                    136:        subq    r5,     r3,     r3      C r3 = *res_ptr - (cy_limb + prod_low)
                    137:        cmpult  r5,     r3,     r5      C r5 = borrow from that subtract
                    138:        stq     r3,     0(r16)          C store last result limb
                    139:        addq    r5,     r0,     r0      C combine carries
                    140:        addq    r4,     r0,     r0      C cy_limb = prod_high + cy
                    141:        ret     r31,    (r26),  1       C return with cy_limb in r0
                    142: $Lend0b:
                    143:        subq    r5,     r3,     r3      C r3 = *res_ptr - prod_low
                    144:        cmpult  r5,     r3,     r5      C r5 = borrow from that subtract
                    145:        stq     r3,     0(r16)          C store the only result limb
                    146:        addq    r0,     r5,     r0      C cy_limb = prod_high + borrow
                    147:        ret     r31,    (r26),  1       C return with cy_limb in r0
                    148:
                    149: $Large:
                    150:        lda     $30,    -240($30)       C allocate a 240-byte stack frame
                    151:        stq     $9,     8($30)          C save $9 (presumably the register written r9 below -- TODO confirm in the m4 defs)
                    152:        stq     $10,    16($30)         C save $10
                    153:        stq     $11,    24($30)         C save $11
                    154:        stq     $12,    32($30)         C save $12
                    155:        stq     $13,    40($30)         C save $13
                    156:        stq     $14,    48($30)         C save $14
                    157:        stq     $15,    56($30)         C save $15; only offsets 8..56 of the frame are used here
                    158:
                    159:        and     r18,    7,      r20     C count for the first loop, 0-7
                    160:        srl     r18,    3,      r18     C count for unrolled loop
                    161:        bis     r31,    r31,    r0      C r0 = 0 (initial cy_limb)
                    162:        beq     r20,    $Lunroll        C no leftover limbs: skip the first loop
                    163:        ldq     r2,     0(r17)          C r2 = s1_limb
                    164:        addq    r17,    8,      r17     C s1_ptr++
                    165:        subq    r20,    1,      r20     C size--
                    166:        mulq    r2,     r19,    r3      C r3 = prod_low
                    167:        ldq     r5,     0(r16)          C r5 = *res_ptr
                    168:        umulh   r2,     r19,    r0      C r0 = prod_high
                    169:        beq     r20,    $Lend1b         C jump if size was == 1
                    170:        ldq     r2,     0(r17)          C r2 = s1_limb
                    171:        addq    r17,    8,      r17     C s1_ptr++
                    172:        subq    r20,    1,      r20     C size--
                    173:        subq    r5,     r3,     r3      C r3 = *res_ptr - prod_low
                    174:        cmpult  r5,     r3,     r4      C r4 = borrow from that subtract
                    175:        stq     r3,     0(r16)          C store result limb
                    176:        addq    r16,    8,      r16     C res_ptr++
                    177:        beq     r20,    $Lend1a         C jump if size was == 2
                    178:
                    179:        ALIGN(8)
                    180: $Loop1:        mulq    r2,     r19,    r3      C r3 = prod_low
                    181:        ldq     r5,     0(r16)          C r5 = *res_ptr
                    182:        addq    r4,     r0,     r0      C cy_limb = cy_limb + 'cy'
                    183:        subq    r20,    1,      r20     C size--
                    184:        umulh   r2,     r19,    r4      C r4 = prod_high (added into cy_limb on the next pass)
                    185:        ldq     r2,     0(r17)          C r2 = s1_limb
                    186:        addq    r17,    8,      r17     C s1_ptr++
                    187:        addq    r3,     r0,     r3      C r3 = cy_limb + prod_low
                    188:        cmpult  r3,     r0,     r0      C r0 = carry from (cy_limb + prod_low)
                    189:        subq    r5,     r3,     r3      C r3 = *res_ptr - (cy_limb + prod_low)
                    190:        cmpult  r5,     r3,     r5      C r5 = borrow from that subtract
                    191:        stq     r3,     0(r16)          C store result limb
                    192:        addq    r16,    8,      r16     C res_ptr++
                    193:        addq    r5,     r0,     r0      C combine carries
                    194:        bne     r20,    $Loop1          C loop while leftover count != 0
                    195:
                    196: $Lend1a:
                    197:        mulq    r2,     r19,    r3      C r3 = prod_low
                    198:        ldq     r5,     0(r16)          C r5 = *res_ptr
                    199:        addq    r4,     r0,     r0      C cy_limb = cy_limb + 'cy'
                    200:        umulh   r2,     r19,    r4      C r4 = prod_high of the last leftover limb
                    201:        addq    r3,     r0,     r3      C r3 = cy_limb + prod_low
                    202:        cmpult  r3,     r0,     r0      C r0 = carry from (cy_limb + prod_low)
                    203:        subq    r5,     r3,     r3      C r3 = *res_ptr - (cy_limb + prod_low)
                    204:        cmpult  r5,     r3,     r5      C r5 = borrow from that subtract
                    205:        stq     r3,     0(r16)          C store result limb
                    206:        addq    r16,    8,      r16     C res_ptr++
                    207:        addq    r5,     r0,     r0      C combine carries
                    208:        addq    r4,     r0,     r0      C cy_limb = prod_high + cy
                    209:        br      r31,    $Lunroll        C leftovers done: enter the unrolled code
                    210: $Lend1b:
                    211:        subq    r5,     r3,     r3      C r3 = *res_ptr - prod_low
                    212:        cmpult  r5,     r3,     r5      C r5 = borrow from that subtract
                    213:        stq     r3,     0(r16)          C store result limb
                    214:        addq    r16,    8,      r16     C res_ptr++
                    215:        addq    r0,     r5,     r0      C cy_limb = prod_high + borrow; falls through
                    216:
                    217: $Lunroll:
                    218:        lda     r17,    -16(r17)        C L1 bookkeeping
                    219:        lda     r16,    -16(r16)        C L1 bookkeeping
                    220:        bis     r0,     r31,    r12     C r12 = cy_limb entering the unrolled code
                    221:
                    222: C ____ UNROLLED LOOP SOFTWARE PIPELINE STARTUP ____
                    223:
                    224:        ldq     r2,     16(r17)         C L1
                    225:        ldq     r3,     24(r17)         C L1
                    226:        lda     r18,    -1(r18)         C L1 bookkeeping
                    227:        ldq     r6,     16(r16)         C L1
                    228:        ldq     r7,     24(r16)         C L1
                    229:        ldq     r0,     32(r17)         C L1
                    230:        mulq    r19,    r2,     r13     C U1
                    231:        ldq     r1,     40(r17)         C L1
                    232:        umulh   r19,    r2,     r14     C U1
                    233:        mulq    r19,    r3,     r15     C U1
                    234:        lda     r17,    64(r17)         C L1 bookkeeping
                    235:        ldq     r4,     32(r16)         C L1
                    236:        ldq     r5,     40(r16)         C L1
                    237:        umulh   r19,    r3,     r8      C U1
                    238:        ldq     r2,     -16(r17)        C L1
                    239:        mulq    r19,    r0,     r9      C U1
                    240:        ldq     r3,     -8(r17)         C L1
                    241:        umulh   r19,    r0,     r10     C U1
                    242:        subq    r6,     r13,    r13     C L0 acc - lo
                    243:        mulq    r19,    r1,     r11     C U1
                    244:        cmpult  r6,     r13,    r20     C L0 lo sub => borrow
                    245:        lda     r16,    64(r16)         C L1 bookkeeping
                    246:        subq    r13,    r12,    r22     C U0 cy sub => answer
                    247:        cmpult  r13,    r12,    r21     C L0 cy sub => borrow
                    248:        addq    r14,    r20,    r14     C U0 hi mul + borrow
                    249:        ldq     r6,     -16(r16)        C L1
                    250:        subq    r7,     r15,    r28     C L0 acc - lo
                    251:        addq    r14,    r21,    r14     C U0 hi mul + borrow
                    252:        cmpult  r7,     r15,    r20     C L0 lo sub => borrow
                    253:        ldq     r7,     -8(r16)         C L1
                    254:        umulh   r19,    r1,     r12     C U1
                    255:        subq    r28,    r14,    r23     C U0 cy sub => answer
                    256:        ldq     r0,     0(r17)          C L1
                    257:        mulq    r19,    r2,     r13     C U1
                    258:        cmpult  r28,    r14,    r21     C L0 cy sub => borrow
                    259:        addq    r8,     r20,    r8      C U0 hi mul + borrow
                    260:        ldq     r1,     8(r17)          C L1
                    261:        umulh   r19,    r2,     r14     C U1
                    262:        subq    r4,     r9,     r9      C L0 acc - lo
                    263:        stq     r22,    -48(r16)        C L0
                    264:        stq     r23,    -40(r16)        C L1
                    265:        mulq    r19,    r3,     r15     C U1
                    266:        addq    r8,     r21,    r8      C U0 hi mul + borrow
                    267:        cmpult  r4,     r9,     r20     C L0 lo sub => borrow
                    268:        subq    r9,     r8,     r22     C U0 cy sub => answer
                    269:        ble     r18,    $Lend           C U1 bookkeeping
                    270:
                    271: C ____ MAIN UNROLLED LOOP ____
                    272:        ALIGN(16)
                    273: $Loop:
                    274:        bis     r31,    r31,    r31     C U1 mt
                    275:        cmpult  r9,     r8,     r21     C L0 cy sub => borrow
                    276:        addq    r10,    r20,    r10     C U0 hi mul + borrow
                    277:        ldq     r4,     0(r16)          C L1
                    278:
                    279:        bis     r31,    r31,    r31     C U1 mt
                    280:        subq    r5,     r11,    r23     C L0 acc - lo
                    281:        addq    r10,    r21,    r10     C L0 hi mul + borrow
                    282:        ldq     r2,     16(r17)         C L1
                    283:
                    284:        umulh   r19,    r3,     r8      C U1
                    285:        cmpult  r5,     r11,    r20     C L0 lo sub => borrow
                    286:        subq    r23,    r10,    r28     C U0 cy sub => answer
                    287:        ldq     r5,     8(r16)          C L1
                    288:
                    289:        mulq    r19,    r0,     r9      C U1
                    290:        cmpult  r23,    r10,    r21     C L0 cy sub => borrow
                    291:        addq    r12,    r20,    r12     C U0 hi mul + borrow
                    292:        ldq     r3,     24(r17)         C L1
                    293:
                    294:        umulh   r19,    r0,     r10     C U1
                    295:        subq    r6,     r13,    r13     C L0 acc - lo
                    296:        stq     r22,    -32(r16)        C L0
                    297:        stq     r28,    -24(r16)        C L1
                    298:
                    299:        bis     r31,    r31,    r31     C L0 st slosh
                    300:        mulq    r19,    r1,     r11     C U1
                    301:        bis     r31,    r31,    r31     C L1 st slosh
                    302:        addq    r12,    r21,    r12     C U0 hi mul + borrow
                    303:
                    304:        cmpult  r6,     r13,    r20     C L0 lo sub => borrow
                    305:        bis     r31,    r31,    r31     C U1 mt
                    306:        lda     r18,    -1(r18)         C L1 bookkeeping
                    307:        subq    r13,    r12,    r22     C U0 cy sub => answer
                    308:
                    309:        bis     r31,    r31,    r31     C U1 mt
                    310:        cmpult  r13,    r12,    r21     C L0 cy sub => borrow
                    311:        addq    r14,    r20,    r14     C U0 hi mul + borrow
                    312:        ldq     r6,     16(r16)         C L1
                    313:
                    314:        bis     r31,    r31,    r31     C U1 mt
                    315:        subq    r7,     r15,    r23     C L0 acc - lo
                    316:        addq    r14,    r21,    r14     C U0 hi mul + borrow
                    317:        ldq     r0,     32(r17)         C L1
                    318:
                    319:        umulh   r19,    r1,     r12     C U1
                    320:        cmpult  r7,     r15,    r20     C L0 lo sub => borrow
                    321:        subq    r23,    r14,    r28     C U0 cy sub => answer
                    322:        ldq     r7,     24(r16)         C L1
                    323:
                    324:        mulq    r19,    r2,     r13     C U1
                    325:        cmpult  r23,    r14,    r21     C L0 cy sub => borrow
                    326:        addq    r8,     r20,    r8      C U0 hi mul + borrow
                    327:        ldq     r1,     40(r17)         C L1
                    328:
                    329:        umulh   r19,    r2,     r14     C U1
                    330:        subq    r4,     r9,     r9      C U0 acc - lo
                    331:        stq     r22,    -16(r16)        C L0
                    332:        stq     r28,    -8(r16)         C L1
                    333:
                    334:        bis     r31,    r31,    r31     C L0 st slosh
                    335:        mulq    r19,    r3,     r15     C U1
                    336:        bis     r31,    r31,    r31     C L1 st slosh
                    337:        addq    r8,     r21,    r8      C L0 hi mul + borrow
                    338:
                    339:        cmpult  r4,     r9,     r20     C L0 lo sub => borrow
                    340:        bis     r31,    r31,    r31     C U1 mt
                    341:        lda     r17,    64(r17)         C L1 bookkeeping
                    342:        subq    r9,     r8,     r22     C U0 cy sub => answer
                    343:
                    344:        bis     r31,    r31,    r31     C U1 mt
                    345:        cmpult  r9,     r8,     r21     C L0 cy sub => borrow
                    346:        addq    r10,    r20,    r10     C U0 hi mul + borrow
                    347:        ldq     r4,     32(r16)         C L1
                    348:
                    349:        bis     r31,    r31,    r31     C U1 mt
                    350:        subq    r5,     r11,    r23     C L0 acc - lo
                    351:        addq    r10,    r21,    r10     C L0 hi mul + borrow
                    352:        ldq     r2,     -16(r17)        C L1
                    353:
                    354:        umulh   r19,    r3,     r8      C U1
                    355:        cmpult  r5,     r11,    r20     C L0 lo sub => borrow
                    356:        subq    r23,    r10,    r28     C U0 cy sub => answer
                    357:        ldq     r5,     40(r16)         C L1
                    358:
                    359:        mulq    r19,    r0,     r9      C U1
                    360:        cmpult  r23,    r10,    r21     C L0 cy sub => borrow
                    361:        addq    r12,    r20,    r12     C U0 hi mul + borrow
                    362:        ldq     r3,     -8(r17)         C L1
                    363:
                    364:        umulh   r19,    r0,     r10     C U1
                    365:        subq    r6,     r13,    r13     C L0 acc - lo
                    366:        stq     r22,    0(r16)          C L0
                    367:        stq     r28,    8(r16)          C L1
                    368:
                    369:        bis     r31,    r31,    r31     C L0 st slosh
                    370:        mulq    r19,    r1,     r11     C U1
                    371:        bis     r31,    r31,    r31     C L1 st slosh
                    372:        addq    r12,    r21,    r12     C U0 hi mul + borrow
                    373:
                    374:        cmpult  r6,     r13,    r20     C L0 lo sub => borrow
                    375:        bis     r31,    r31,    r31     C U1 mt
                    376:        lda     r16,    64(r16)         C L1 bookkeeping
                    377:        subq    r13,    r12,    r22     C U0 cy sub => answer
                    378:
                    379:        bis     r31,    r31,    r31     C U1 mt
                    380:        cmpult  r13,    r12,    r21     C L0 cy sub => borrow
                    381:        addq    r14,    r20,    r14     C U0 hi mul + borrow
                    382:        ldq     r6,     -16(r16)        C L1
                    383:
                    384:        bis     r31,    r31,    r31     C U1 mt
                    385:        subq    r7,     r15,    r23     C L0 acc - lo
                    386:        addq    r14,    r21,    r14     C U0 hi mul + borrow
                    387:        ldq     r0,     0(r17)          C L1
                    388:
                    389:        umulh   r19,    r1,     r12     C U1
                    390:        cmpult  r7,     r15,    r20     C L0 lo sub => borrow
                    391:        subq    r23,    r14,    r28     C U0 cy sub => answer
                    392:        ldq     r7,     -8(r16)         C L1
                    393:
                    394:        mulq    r19,    r2,     r13     C U1
                    395:        cmpult  r23,    r14,    r21     C L0 cy sub => borrow
                    396:        addq    r8,     r20,    r8      C U0 hi mul + borrow
                    397:        ldq     r1,     8(r17)          C L1
                    398:
                    399:        umulh   r19,    r2,     r14     C U1
                    400:        subq    r4,     r9,     r9      C L0 acc - lo
                    401:        stq     r22,    -48(r16)        C L0
                    402:        stq     r28,    -40(r16)        C L1
                    403:
                    404:        bis     r31,    r31,    r31     C L0 st slosh
                    405:        mulq    r19,    r3,     r15     C U1
                    406:        bis     r31,    r31,    r31     C L1 st slosh
                    407:        addq    r8,     r21,    r8      C U0 hi mul + borrow
                    408:
                    409:        cmpult  r4,     r9,     r20     C L0 lo sub => borrow
                    410:        subq    r9,     r8,     r22     C U0 cy sub => answer
                    411:        bis     r31,    r31,    r31     C L1 mt
                    412:        bgt     r18,    $Loop           C U1 bookkeeping
                    413:
                    414: C ____ UNROLLED LOOP SOFTWARE PIPELINE FINISH ____
                    415: $Lend:
                    416:        cmpult  r9,     r8,     r21     C L0 cy sub => borrow
                    417:        addq    r10,    r20,    r10     C U0 hi mul + borrow
                    418:        ldq     r4,     0(r16)          C L1
                    419:        subq    r5,     r11,    r23     C L0 acc - lo
                    420:        addq    r10,    r21,    r10     C L0 hi mul + borrow
                    421:        umulh   r19,    r3,     r8      C U1
                    422:        cmpult  r5,     r11,    r20     C L0 lo sub => borrow
                    423:        subq    r23,    r10,    r28     C U0 cy sub => answer
                    424:        ldq     r5,     8(r16)          C L1
                    425:        mulq    r19,    r0,     r9      C U1
                    426:        cmpult  r23,    r10,    r21     C L0 cy sub => borrow
                    427:        addq    r12,    r20,    r12     C U0 hi mul + borrow
                    428:        umulh   r19,    r0,     r10     C U1
                    429:        subq    r6,     r13,    r13     C L0 acc - lo
                    430:        stq     r22,    -32(r16)        C L0
                    431:        stq     r28,    -24(r16)        C L1
                    432:        mulq    r19,    r1,     r11     C U1
                    433:        addq    r12,    r21,    r12     C U0 hi mul + borrow
                    434:        cmpult  r6,     r13,    r20     C L0 lo sub => borrow
                    435:        subq    r13,    r12,    r22     C U0 cy sub => answer
                    436:        cmpult  r13,    r12,    r21     C L0 cy sub => borrow
                    437:        addq    r14,    r20,    r14     C U0 hi mul + borrow
                    438:        subq    r7,     r15,    r23     C L0 acc - lo
                    439:        addq    r14,    r21,    r14     C U0 hi mul + borrow
                    440:        umulh   r19,    r1,     r12     C U1
                    441:        cmpult  r7,     r15,    r20     C L0 lo sub => borrow
                    442:        subq    r23,    r14,    r28     C U0 cy sub => answer
                    443:        cmpult  r23,    r14,    r21     C L0 cy sub => borrow
                    444:        addq    r8,     r20,    r8      C U0 hi mul + borrow
                    445:        subq    r4,     r9,     r9      C U0 acc - lo
                    446:        stq     r22,    -16(r16)        C L0
                    447:        stq     r28,    -8(r16)         C L1
                    448:        addq    r8,     r21,    r8      C L0 hi mul + borrow
                    449:        cmpult  r4,     r9,     r20     C L0 lo sub => borrow
                    450:        subq    r9,     r8,     r22     C U0 cy sub => answer
                    451:        cmpult  r9,     r8,     r21     C L0 cy sub => borrow
                    452:        addq    r10,    r20,    r10     C U0 hi mul + borrow
                    453:        subq    r5,     r11,    r23     C L0 acc - lo
                    454:        addq    r10,    r21,    r10     C L0 hi mul + borrow
                    455:        cmpult  r5,     r11,    r20     C L0 lo sub => borrow
                    456:        subq    r23,    r10,    r28     C U0 cy sub => answer
                    457:        cmpult  r23,    r10,    r21     C L0 cy sub => borrow
                    458:        addq    r12,    r20,    r12     C U0 hi mul + borrow
                    459:        stq     r22,    0(r16)          C L0
                    460:        stq     r28,    8(r16)          C L1
                    461:        addq    r12,    r21,    r0      C U0 hi mul + borrow => cy_limb in r0
                    462:
                    463:        ldq     $9,     8($30)          C restore $9..$15 from the frame slots written at $Large
                    464:        ldq     $10,    16($30)         C restore $10
                    465:        ldq     $11,    24($30)         C restore $11
                    466:        ldq     $12,    32($30)         C restore $12
                    467:        ldq     $13,    40($30)         C restore $13
                    468:        ldq     $14,    48($30)         C restore $14
                    469:        ldq     $15,    56($30)         C restore $15
                    470:        lda     $30,    240($30)        C release the 240-byte stack frame
                    471:        ret     r31,    (r26),  1       C return with cy_limb in r0
                    472: EPILOGUE(mpn_submul_1)
                    473: ASM_END()

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>