arch/alpha/lib/ev6-copy_page.S

   1 /* SPDX-License-Identifier: GPL-2.0 */
   2 /*
   3  * arch/alpha/lib/ev6-copy_page.S
   4  *
   5  * Copy an entire page.
   6  */
   7
   8 /* The following comparison of this routine vs the normal copy_page.S
   9    was written by an unnamed ev6 hardware designer and forwarded to me
  10    via Steven Hobbs <hobbs@steven.zko.dec.com>.
  11
  12    First Problem: STQ overflows.
  13    -----------------------------
  14
  15         It would be nice if EV6 handled every resource overflow efficiently,
   16         but for some it doesn't, including store queue overflows.  Such an
   17         overflow causes a trap and a restart of the pipe.
  18
  19         To get around this we sometimes use (to borrow a term from a VSSAD
  20         researcher) "aeration".  The idea is to slow the rate at which the
  21         processor receives valid instructions by inserting nops in the fetch
  22         path.  In doing so, you can prevent the overflow and actually make
  23         the code run faster.  You can, of course, take advantage of the fact
  24         that the processor can fetch at most 4 aligned instructions per cycle.
  25
  26         I inserted enough nops to force it to take 10 cycles to fetch the
  27         loop code.  In theory, EV6 should be able to execute this loop in
  28         9 cycles but I was not able to get it to run that fast -- the initial
  29         conditions were such that I could not reach this optimum rate on
  30         (chaotic) EV6.  I wrote the code such that everything would issue
  31         in order.
  32
  33    Second Problem: Dcache index matches.
  34    -------------------------------------
  35
  36         If you are going to use this routine on random aligned pages, there
  37         is a 25% chance that the pages will be at the same dcache indices.
   38         Without care, this results in many nasty memory traps.
  39
  40         The solution is to schedule the prefetches to avoid the memory
  41         conflicts.  I schedule the wh64 prefetches farther ahead of the
  42         read prefetches to avoid this problem.
  43
  44    Third Problem: Needs more prefetching.
  45    --------------------------------------
  46
  47         In order to improve the code I added deeper prefetching to take the
  48         most advantage of EV6's bandwidth.
  49
  50         I also prefetched the read stream. Note that adding the read prefetch
  51         forced me to add another cycle to the inner-most kernel - up to 11
  52         from the original 8 cycles per iteration.  We could improve performance
  53         further by unrolling the loop and doing multiple prefetches per cycle.
  54
  55    I think that the code below will be very robust and fast code for the
  56    purposes of copying aligned pages.  It is slower when both source and
  57    destination pages are in the dcache, but it is my guess that this is
  58    less important than the dcache miss case.  */
  59
  60 #include <asm/export.h>
   61         .text
   62         .align 4
              /* copy_page: copy 128 consecutive 64-byte lines (8192 bytes in
                 total) from the source to the destination, as 118 passes of
                 the main loop followed by 10 passes of the cleanup loop.

                 Register roles, as used by the code below:
                   $16     destination pointer.  Every stq targets it and it
                           advances by 64 at the end of each loop pass.
                   $17     source pointer.  Every ldq/ldl reads through it and
                           it advances by 64 in each loop pass.
                   $18     pass counter (118 for the main loop, then 10 for
                           the cleanup loop).
                   $19     address of the next destination line to write-hint.
                   $0-$7   the eight quadwords of the line being copied.
                   $1, $2  additionally serve as scratch addresses for the
                           initial write hints, before the main loop starts.

                 All of the registers listed above are overwritten.  No stack
                 slot is read or written.

                 NOTE(review): the file header says this copies an entire
                 page, so 8192 is presumably the page size, and both pointers
                 are presumably 64-byte aligned so that each pass covers
                 exactly one cache line -- confirm against the callers.

                 The instructions are laid out in blank-line-separated groups
                 of four, padded with unop/nop.  That layout is the "aeration"
                 described in the comment at the top of the file.  Do not
                 reorder, remove or compact instructions without re-measuring
                 on hardware.  */
   63         .global copy_page
   64         .ent copy_page
   65 copy_page:
   66         .prologue 0
   67
   68         /* Prefetch 5 read cachelines; write-hint 10 cache lines.  */
              /* The five ldl-to-$31 instructions below are the read
                 prefetches (source lines 0-4) and the ten wh64 instructions
                 are the write hints (destination lines 0-9).  $18 and $19
                 are also initialised in this section for the main loop.  */
   69         wh64    ($16)
   70         ldl     $31,0($17)
   71         ldl     $31,64($17)
   72         lda     $1,1*64($16)
   73
   74         wh64    ($1)
   75         ldl     $31,128($17)
   76         ldl     $31,192($17)
   77         lda     $1,2*64($16)
   78
   79         wh64    ($1)
   80         ldl     $31,256($17)
   81         lda     $18,118                /* main-loop pass count */
   82         lda     $1,3*64($16)
   83
   84         wh64    ($1)
   85         nop                            /* filler: completes the group of four */
   86         lda     $1,4*64($16)
   87         lda     $2,5*64($16)
   88
   89         wh64    ($1)
   90         wh64    ($2)
   91         lda     $1,6*64($16)
   92         lda     $2,7*64($16)
   93
   94         wh64    ($1)
   95         wh64    ($2)
   96         lda     $1,8*64($16)
   97         lda     $2,9*64($16)
   98
   99         wh64    ($1)
  100         wh64    ($2)
  101         lda     $19,10*64($16)         /* first line the main loop will write-hint */
  102         nop                            /* filler: completes the group of four */
  103
  104         /* Main prefetching/write-hinting loop.  */
              /* Runs 118 times.  Each pass loads one 64-byte source line
                 into $0-$7, prefetches the source line five lines ahead
                 (320 = 5*64), write-hints the destination line ten lines
                 ahead (through $19), stores the eight quadwords, and then
                 advances $16, $17 and $19 by 64.  The pass is laid out as
                 11 groups of four instructions, and 21 of those 44 slots
                 are unop filler.  */
  105 1:      ldq     $0,0($17)
  106         ldq     $1,8($17)
  107         unop
  108         unop
  109
  110         unop
  111         unop
  112         ldq     $2,16($17)
  113         ldq     $3,24($17)
  114
  115         ldq     $4,32($17)
  116         ldq     $5,40($17)
  117         unop
  118         unop
  119
  120         unop
  121         unop
  122         ldq     $6,48($17)
  123         ldq     $7,56($17)
  124
  125         ldl     $31,320($17)           /* read prefetch, five lines ahead */
  126         unop
  127         unop
  128         unop
  129
  130         /* This gives the extra cycle of aeration above the minimum.  */
  131         unop
  132         unop
  133         unop
  134         unop
  135
  136         wh64    ($19)                  /* write hint, ten lines ahead of $16 */
  137         unop
  138         unop
  139         unop
  140
  141         stq     $0,0($16)
  142         subq    $18,1,$18              /* one fewer pass remaining */
  143         stq     $1,8($16)
  144         unop
  145
  146         unop
  147         stq     $2,16($16)
  148         addq    $17,64,$17             /* source: next line (all loads are done) */
  149         stq     $3,24($16)
  150
  151         stq     $4,32($16)
  152         stq     $5,40($16)
  153         addq    $19,64,$19             /* write-hint address: next line */
  154         unop
  155
  156         stq     $6,48($16)
  157         stq     $7,56($16)
  158         addq    $16,64,$16             /* destination: next line (all stores are done) */
  159         bne     $18, 1b                /* repeat until the pass count reaches zero */
  160
  161         /* Prefetch the final 5 cache lines of the read stream.  */
              /* Ten source lines remain at $17 at this point.  Offsets 320
                 through 576 are the last five of them.  The first five were
                 already prefetched by the final passes of the main loop.  */
  162         lda     $18,10                 /* cleanup-loop pass count */
  163         ldl     $31,320($17)
  164         ldl     $31,384($17)
  165         ldl     $31,448($17)
  166
  167         ldl     $31,512($17)
  168         ldl     $31,576($17)
  169         nop
  170         nop
  171
  172         /* Non-prefetching, non-write-hinting cleanup loop for the
  173            final 10 cache lines.  */
              /* Runs 10 times.  Each pass performs the same eight-quadword
                 load/store sequence as the main loop and advances $16 and
                 $17 by 64, with no unop padding between the groups.  */
  174 2:      ldq     $0,0($17)
  175         ldq     $1,8($17)
  176         ldq     $2,16($17)
  177         ldq     $3,24($17)
  178
  179         ldq     $4,32($17)
  180         ldq     $5,40($17)
  181         ldq     $6,48($17)
  182         ldq     $7,56($17)
  183
  184         stq     $0,0($16)
  185         subq    $18,1,$18              /* one fewer pass remaining */
  186         stq     $1,8($16)
  187         addq    $17,64,$17             /* source: next line (all loads are done) */
  188
  189         stq     $2,16($16)
  190         stq     $3,24($16)
  191         stq     $4,32($16)
  192         stq     $5,40($16)
  193
  194         stq     $6,48($16)
  195         stq     $7,56($16)
  196         addq    $16,64,$16             /* destination: next line (all stores are done) */
  197         bne     $18, 2b                /* repeat until the pass count reaches zero */
  198
  199         ret                            /* all 128 lines have been copied */
  200         nop                            /* the three fillers after ret complete the group of four */
  201         unop
  202         nop
  203
  204         .end copy_page
  205         EXPORT_SYMBOL(copy_page)