[BACK]Return to sub_n.asm CVS log [TXT][DIR] Up to [local] / OpenXM_contrib / gmp / mpn / alpha / ev6

Annotation of OpenXM_contrib/gmp/mpn/alpha/ev6/sub_n.asm, Revision 1.1.1.1

1.1       ohara       1: dnl  Alpha ev6 mpn_sub_n -- Subtract two limb vectors of the same length > 0
                      2: dnl  and store difference in a third limb vector.
                      3:
                      4: dnl  Copyright 2000 Free Software Foundation, Inc.
                      5:
                      6: dnl  This file is part of the GNU MP Library.
                      7:
                      8: dnl  The GNU MP Library is free software; you can redistribute it and/or modify
                      9: dnl  it under the terms of the GNU Lesser General Public License as published
                     10: dnl  by the Free Software Foundation; either version 2.1 of the License, or (at
                     11: dnl  your option) any later version.
                     12:
                     13: dnl  The GNU MP Library is distributed in the hope that it will be useful, but
                     14: dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
                     15: dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
                     16: dnl  License for more details.
                     17:
                     18: dnl  You should have received a copy of the GNU Lesser General Public License
                     19: dnl  along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
                     20: dnl  the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
                     21: dnl  MA 02111-1307, USA.
                     22:
                     23: include(`../config.m4')
                     24:
                     25: dnl  INPUT PARAMETERS
                     26: dnl  res_ptr   r16
                     27: dnl  s1_ptr    r17
                     28: dnl  s2_ptr    r18
                     29: dnl  size      r19
                     30:
                     31: dnl  This code runs at 5.4 cycles/limb on EV5, and 2.1 cycles/limb on EV6.
                     32:
                     33: dnl This code was written in close cooperation with ev6 pipeline expert
                     34: dnl Steve Root.  Any errors are tege's fault, though.
                     35:
                     36: dnl  work triplet  0-2
                     37: dnl  work triplet  3-5
                     38: dnl  work triplet  6-8
                     39: dnl  work triplet  9-11
                     40: dnl  borrows 20-23
                     41:
                     42: dnl  sustains 8 subtracts in 17 cycles !
                     43: dnl   (from the d_cache)
                     44:
                     45: dnl  pair loads and stores where possible
                     46: dnl  store pairs oct-aligned where possible
                     47: dnl    (didn't need it here)
                     48: dnl  stores are delayed every third cycle
                     49: dnl  loads and stores are delayed by fills
                     50: dnl  U stays still, put code there where possible
                     51: dnl   (note alternation of U1 and U0)
                     52: dnl  L moves because of loads and stores
                     53: dnl  note dampers in L to limit damage
                     54: dnl  note, load ahead of time where possible
                     55:
                     56: dnl  this odd-looking optimization expects
                     57: dnl  that we're having random bits in our data, so
                     58: dnl  that a pure zero result is unlikely. so we
                     59: dnl  penalize the unlikely case to help the
                     60: dnl  common case.
                     61:
                     62: ASM_START()
                     63: PROLOGUE(mpn_sub_n)
                     64:        lda     r30,    -240(r30)       C open a 240 byte stack frame
                     65:        stq     r9,     8(r30)          C save work register r9
                     66:        stq     r10,    16(r30)         C save work register r10
                     67:        stq     r11,    24(r30)         C save work register r11
                     68: C Only frame offsets 8-24 are used by this routine.
                     69:        lda     r19,    -8(r19)         C L1 move counter
                     70:
                     71:        bis     r31,    r31,    r23     C no borrow into the first limb
                     72:        blt     r19,    $Lsmall         C under 8 limbs so use simple loop
                     73: C Wind-up.  Load limbs 0-7 and start the first four subtracts.
                     74:        ldq     r0,     0(r17)          C L0 get next ones
                     75:        ldq     r1,     0(r18)          C L1
                     76:        ldq     r3,     8(r17)          C L0 get next ones
                     77:        ldq     r4,     8(r18)          C L1
                     78:        ldq     r6,     16(r17)         C L0 get next ones
                     79:        ldq     r7,     16(r18)         C L1
                     80:
                     81:        ldq     r9,     24(r17)         C L0 get next ones
                     82:        ldq     r10,    24(r18)         C L1
                     83:
                     84:        subq    r0,     r1,     r2      C U1 sub two data
                     85:
                     86:        cmpult  r0,     r1,     r20     C U1 did it borrow
                     87:
                     88:        ldq     r0,     32(r17)         C L0 get next ones
                     89:        ldq     r1,     32(r18)         C L1
                     90:
                     91:        subq    r3,     r4,     r5      C U0 sub two data
                     92:
                     93:        cmpult  r3,     r4,     r21     C U0 did it borrow
                     94:        ldq     r3,     40(r17)         C L0 get next ones
                     95:        ldq     r4,     40(r18)         C L1
                     96:
                     97:        subq    r6,     r7,     r8      C U1 sub two data
                     98:        subq    r5,     r20,    r24     C U0 borrow from last
                     99:        stq     r2,     0(r16)          C L1 store limb 0 which has no borrow in
                    100: C A zero difference passes the incoming borrow on.  Fixed out of line.
                    101:        cmpult  r6,     r7,     r22     C U1 did it borrow
                    102:        beq     r5,     $fix5w          C U0 fix exact zero
                    103: $ret5w:        ldq     r6,     48(r17)         C L0 get next ones
                    104:        ldq     r7,     48(r18)         C L1
                    105:
                    106:        bis     r31,    r31,    r31     C L  damp out
                    107:        subq    r8,     r21,    r25     C U1 borrow from last
                    108:        bis     r31,    r31,    r31     C L  moves in L !
                    109:        subq    r9,     r10,    r11     C U0 sub two data
                    110:
                    111:        beq     r8,     $fix6w          C U1 fix exact zero
                    112: $ret6w:        cmpult  r9,     r10,    r23     C U0 did it borrow
                    113:        ldq     r9,     56(r17)         C L0 get next ones
                    114:        ldq     r10,    56(r18)         C L1
                    115:
                    116:        lda     r17,    64(r17)         C L0 move pointer
                    117:        bis     r31,    r31,    r31     C U
                    118:        lda     r18,    64(r18)         C L1 move pointer
                    119:
                    120:        lda     r19,    -8(r19)         C L1 move counter
                    121:        blt     r19,    $Lend           C under 16 limbs so skip main loop
                    122:
                    123: C Main loop.  8-way unrolled.  Each pass stores 8 limbs and loads 8 more.
                    124:        ALIGN(8)
                    125: $Loop:
                    126:        subq    r0,     r1,     r2      C U1 sub two data
                    127:        stq     r24,    8(r16)          C L0 put an answer
                    128:        subq    r11,    r22,    r24     C U0 borrow from last
                    129:        stq     r25,    16(r16)         C L1 pair
                    130: C r24 and r25 hold finished limbs until their store slot comes up.
                    131:        cmpult  r0,     r1,     r20     C U1 did it borrow
                    132:        beq     r11,    $fix7           C U0 fix exact 0
                    133: $ret7: ldq     r0,     0(r17)          C L0 get next ones
                    134:        ldq     r1,     0(r18)          C L1
                    135:
                    136:        bis     r31,    r31,    r31     C L  damp out
                    137:        subq    r2,     r23,    r25     C U1 borrow from last
                    138:        bis     r31,    r31,    r31     C L  moves in L !
                    139:        subq    r3,     r4,     r5      C U0 sub two data
                    140:
                    141:        beq     r2,     $fix0           C U1 fix exact zero
                    142: $ret0: cmpult  r3,     r4,     r21     C U0 did it borrow
                    143:        ldq     r3,     8(r17)          C L0 get next ones
                    144:        ldq     r4,     8(r18)          C L1
                    145:
                    146:        subq    r6,     r7,     r8      C U1 sub two data
                    147:        stq     r24,    24(r16)         C L0 store pair
                    148:        subq    r5,     r20,    r24     C U0 borrow from last
                    149:        stq     r25,    32(r16)         C L1
                    150:
                    151:        cmpult  r6,     r7,     r22     C U1 did it borrow
                    152:        beq     r5,     $fix1           C U0 fix exact zero
                    153: $ret1: ldq     r6,     16(r17)         C L0 get next ones
                    154:        ldq     r7,     16(r18)         C L1
                    155:
                    156:        lda     r16,    64(r16)         C L0 move pointer
                    157:        subq    r8,     r21,    r25     C U1 borrow from last
                    158:        lda     r19,    -8(r19)         C L1 move counter
                    159:        subq    r9,     r10,    r11     C U0 sub two data
                    160:
                    161:        beq     r8,     $fix2           C U1 fix exact zero
                    162: $ret2: cmpult  r9,     r10,    r23     C U0 did it borrow
                    163:        ldq     r9,     24(r17)         C L0 get next ones
                    164:        ldq     r10,    24(r18)         C L1
                    165: C r16 was advanced above so the second half stores at negative offsets.
                    166:        subq    r0,     r1,     r2      C U1 sub two data
                    167:        stq     r24,    -24(r16)        C L0 put an answer
                    168:        subq    r11,    r22,    r24     C U0 borrow from last
                    169:        stq     r25,    -16(r16)        C L1 pair
                    170:
                    171:        cmpult  r0,     r1,     r20     C U1 did it borrow
                    172:        beq     r11,    $fix3           C U0 fix exact 0
                    173: $ret3: ldq     r0,     32(r17)         C L0 get next ones
                    174:        ldq     r1,     32(r18)         C L1
                    175:
                    176:        bis     r31,    r31,    r31     C L  damp out
                    177:        subq    r2,     r23,    r25     C U1 borrow from last
                    178:        bis     r31,    r31,    r31     C L  moves in L !
                    179:        subq    r3,     r4,     r5      C U0 sub two data
                    180:
                    181:        beq     r2,     $fix4           C U1 fix exact zero
                    182: $ret4: cmpult  r3,     r4,     r21     C U0 did it borrow
                    183:        ldq     r3,     40(r17)         C L0 get next ones
                    184:        ldq     r4,     40(r18)         C L1
                    185:
                    186:        subq    r6,     r7,     r8      C U1 sub two data
                    187:        stq     r24,    -8(r16)         C L0 store pair
                    188:        subq    r5,     r20,    r24     C U0 borrow from last
                    189:        stq     r25,    0(r16)          C L1
                    190:
                    191:        cmpult  r6,     r7,     r22     C U1 did it borrow
                    192:        beq     r5,     $fix5           C U0 fix exact zero
                    193: $ret5: ldq     r6,     48(r17)         C L0 get next ones
                    194:        ldq     r7,     48(r18)         C L1
                    195:
                    196:        bis     r31,    r31,    r31     C L  damp out
                    197:        subq    r8,     r21,    r25     C U1 borrow from last
                    198:        bis     r31,    r31,    r31     C L  moves in L !
                    199:        subq    r9,     r10,    r11     C U0 sub two data
                    200:
                    201:        beq     r8,     $fix6           C U1 fix exact zero
                    202: $ret6: cmpult  r9,     r10,    r23     C U0 did it borrow
                    203:        ldq     r9,     56(r17)         C L0 get next ones
                    204:        ldq     r10,    56(r18)         C L1
                    205:
                    206:        lda     r17,    64(r17)         C L0 move pointer
                    207:        bis     r31,    r31,    r31     C U
                    208:        lda     r18,    64(r18)         C L1 move pointer
                    209:        bge     r19,    $Loop           C U1 loop control
                    210: C ==== main loop end
                    211: C Wind-down.  Store the 7 limbs still in flight without loading more.
                    212: $Lend:
                    213:        subq    r0,     r1,     r2      C U1 sub two data
                    214:        stq     r24,    8(r16)          C L0 put an answer
                    215:        subq    r11,    r22,    r24     C U0 borrow from last
                    216:        stq     r25,    16(r16)         C L1 pair
                    217:
                    218:        cmpult  r0,     r1,     r20     C U1 did it borrow
                    219:        beq     r11,    $fix7c          C U0 fix exact 0
                    220: $ret7c:
                    221:        subq    r2,     r23,    r25     C U1 borrow from last
                    222:        subq    r3,     r4,     r5      C U0 sub two data
                    223:
                    224:        beq     r2,     $fix0c          C U1 fix exact zero
                    225: $ret0c:        cmpult  r3,     r4,     r21     C U0 did it borrow
                    226:
                    227:        subq    r6,     r7,     r8      C U1 sub two data
                    228:        stq     r24,    24(r16)         C L0 store pair
                    229:        subq    r5,     r20,    r24     C U0 borrow from last
                    230:        stq     r25,    32(r16)         C L1
                    231:
                    232:        cmpult  r6,     r7,     r22     C U1 did it borrow
                    233:        beq     r5,     $fix1c          C U0 fix exact zero
                    234: $ret1c:
                    235:        lda     r16,    64(r16)         C L0 move pointer
                    236:        subq    r8,     r21,    r25     C U1 borrow from last
                    237:        subq    r9,     r10,    r11     C U0 sub two data
                    238:
                    239:        beq     r8,     $fix2c          C U1 fix exact zero
                    240: $ret2c:        cmpult  r9,     r10,    r23     C U0 did it borrow
                    241:
                    242:        stq     r24,    -24(r16)        C L0 put an answer
                    243:        subq    r11,    r22,    r24     C U0 borrow from last
                    244:        stq     r25,    -16(r16)        C L1 pair
                    245:
                    246:        beq     r11,    $fix3c          C U0 fix exact 0
                    247: $ret3c:
                    248:        stq     r24,    -8(r16)         C L0 store pair
                    249: C Fall through to $Lsmall for the 0-7 limbs left over.  r23 is the borrow.
                    250: C Simple loop.  One limb per pass with r23 as the running borrow.
                    251: $Lsmall:
                    252:        lda     r19,    8(r19)          C add back the 8 taken off the counter
                    253:        beq     r19,    $Lret           C nothing left to do
                    254:
                    255:        ldq     r0,     0(r17)          C load first limb pair
                    256:        ldq     r1,     0(r18)
                    257:        lda     r19,    -1(r19)         C count the limb just loaded
                    258:        beq     r19,    $Lend0          C it was the last one so skip loop
                    259:
                    260:        ALIGN(8)
                    261: $Loop0:        subq    r0,     r1,     r2      C main sub
                    262:        cmpult  r0,     r1,     r8      C compute bw from last sub
                    263:        ldq     r0,     8(r17)          C load next limb pair ahead of use
                    264:        ldq     r1,     8(r18)
                    265:        subq    r2,     r23,    r20     C borrow sub
                    266:        lda     r17,    8(r17)          C advance s1 pointer
                    267:        lda     r18,    8(r18)          C advance s2 pointer
                    268:        stq     r20,    0(r16)          C store result limb
                    269:        cmpult  r2,     r23,    r23     C compute bw from last sub
                    270:        lda     r19,    -1(r19)         C decr loop cnt
                    271:        bis     r8,     r23,    r23     C combine bw from the two subs
                    272:        lda     r16,    8(r16)          C advance result pointer
                    273:        bne     r19,    $Loop0          C more limbs remain
                    274: $Lend0:        subq    r0,     r1,     r2      C main sub
                    275:        cmpult  r0,     r1,     r8      C compute bw from last sub
                    276:        subq    r2,     r23,    r20     C borrow sub
                    277:        cmpult  r2,     r23,    r23     C compute bw from last sub
                    278:        stq     r20,    0(r16)          C store last result limb
                    279:        bis     r8,     r23,    r23     C combine bw from the two subs
                    280: C Return the final borrow in r0 and restore the saved registers.
                    281: $Lret:
                    282:        lda     r0,     0(r23)          C copy borrow into return register
                    283:
                    284:        ldq     r9,     8(r30)          C restore r9
                    285:        ldq     r10,    16(r30)         C restore r10
                    286:        ldq     r11,    24(r30)         C restore r11
                    287:        lda     r30,    240(r30)        C release the stack frame
                    288:        ret     r31,(r26),1
                    289: C Out of line fixups.  Reached when a limb difference is exactly zero.
                    290: C Each ORs the incoming borrow into the outgoing one and branches back.
                    291: $fix5w:        bis     r21,    r20,    r21     C bring forward borrow
                    292:        br      r31,    $ret5w
                    293: $fix6w:        bis     r22,    r21,    r22     C bring forward borrow
                    294:        br      r31,    $ret6w
                    295: $fix0: bis     r20,    r23,    r20     C bring forward borrow
                    296:        br      r31,    $ret0
                    297: $fix1: bis     r21,    r20,    r21     C bring forward borrow
                    298:        br      r31,    $ret1
                    299: $fix2: bis     r22,    r21,    r22     C bring forward borrow
                    300:        br      r31,    $ret2
                    301: $fix3: bis     r23,    r22,    r23     C bring forward borrow
                    302:        br      r31,    $ret3
                    303: $fix4: bis     r20,    r23,    r20     C bring forward borrow
                    304:        br      r31,    $ret4
                    305: $fix5: bis     r21,    r20,    r21     C bring forward borrow
                    306:        br      r31,    $ret5
                    307: $fix6: bis     r22,    r21,    r22     C bring forward borrow
                    308:        br      r31,    $ret6
                    309: $fix7: bis     r23,    r22,    r23     C bring forward borrow
                    310:        br      r31,    $ret7
                    311: $fix0c:        bis     r20,    r23,    r20     C bring forward borrow
                    312:        br      r31,    $ret0c
                    313: $fix1c:        bis     r21,    r20,    r21     C bring forward borrow
                    314:        br      r31,    $ret1c
                    315: $fix2c:        bis     r22,    r21,    r22     C bring forward borrow
                    316:        br      r31,    $ret2c
                    317: $fix3c:        bis     r23,    r22,    r23     C bring forward borrow
                    318:        br      r31,    $ret3c
                    319: $fix7c:        bis     r23,    r22,    r23     C bring forward borrow
                    320:        br      r31,    $ret7c
                    321:
                    322: EPILOGUE(mpn_sub_n)
                    323: ASM_END()

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>