/* --------------------------------------------------------------- */
/* PLEASE DO NOT MODIFY THIS SECTION */
/* This prolog section is automatically generated. */
/* (C) Copyright 2001,2006, */
/* International Business Machines Corporation, */
/* All Rights Reserved. */
/* --------------------------------------------------------------- */
/* PROLOG END TAG zYx */
/*
 * Copyright (C) 2006 IBM Corp.
 *
 * "Optimized" lookup operations for n-way set associative
 * software managed cache.
 */
#include <spu_intrinsics.h>

#ifndef __SPE_CACHE_NWAY_OPT_H_
#define __SPE_CACHE_NWAY_OPT_H_
/**
 * Look up and return data from the cache. If the data
 * is not currently in cache then transfer it from main
 * memory.
 *
 * This code uses a conditional branch to the cache miss
 * handler in the event that the requested data is not
 * in the cache. A branch hint is used to avoid paying
 * the branch stall penalty.
 */
#define __spe_cache_rd(type, ea)                                \
({                                                              \
    int set, idx, lnum, byte;                                   \
    type ret;                                                   \
                                                                \
    _spe_cache_nway_lookup_(ea, set, idx);                      \
                                                                \
    if (unlikely(idx < 0)) {                                    \
        /* Miss: allocate a way, then wait for the DMA. */      \
        idx = _spe_cache_miss_(ea, set, -1);                    \
        spu_writech(22, SPE_CACHE_SET_TAGMASK(set));            \
        spu_mfcstat(MFC_TAG_UPDATE_ALL);                        \
    }                                                           \
    lnum = _spe_cacheline_num_(set, idx);                       \
    byte = _spe_cacheline_byte_offset_(ea);                     \
    ret = *((type *) (&spe_cache_mem[lnum + byte]));            \
    ret;                                                        \
})
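
/*
 * Illustrative usage sketch (documentation only, not part of the
 * cache API): reading elements from main storage through the
 * software cache one at a time. The helper name sum_floats and
 * the effective-address arithmetic below are hypothetical.
 *
 *     static float sum_floats(unsigned int ea, int n)
 *     {
 *         float sum = 0.0f;
 *         int i;
 *         for (i = 0; i < n; i++) {
 *             sum += __spe_cache_rd(float, ea + i * sizeof(float));
 *         }
 *         return sum;
 *     }
 *
 * Because the macro expands to a GNU C statement expression, it can
 * be used wherever an expression of the requested type is expected.
 */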
/**
 * Fetch four data elements from the cache.
 *
 * This code uses one conditional branch in
 * the event that any of the four elements
 * is not present in the cache.
 *
 * On a miss, lightweight locking is used to
 * avoid casting out entries that were found.
 * Further, we wait just once for the transfers,
 * allowing for parallel [rather than serial]
 * completion of the misses.
 */
#define __spe_cache_rd_x4(type, ea_x4)                              \
({                                                                  \
    vector unsigned int missing;                                    \
    unsigned int ms;                                                \
    vector unsigned int cindex;                                     \
    unsigned int d0, d1, d2, d3;                                    \
    vector unsigned int s_x4;                                       \
    vector signed int i_x4;                                         \
    vector unsigned int ibyte, iline;                               \
    vector unsigned int ret;                                        \
    unsigned int idx0, idx1, idx2, idx3;                            \
                                                                    \
    _spe_cache_nway_lookup_x4(ea_x4, s_x4, i_x4);                   \
                                                                    \
    /* A miss is flagged by a negative way index; build a mask. */  \
    missing = spu_rlmask((vector unsigned int)i_x4, -8);            \
    ms = spu_extract(spu_gather(missing), 0);                       \
                                                                    \
    ibyte = _spe_cacheline_byte_offset_x4(ea_x4);                   \
    iline = _spe_cacheline_num_x4(s_x4,                             \
                                  (vector unsigned int)i_x4);       \
    cindex = spu_add(iline, ibyte);                                 \
                                                                    \
    idx0 = spu_extract(cindex, 0);                                  \
    idx1 = spu_extract(cindex, 1);                                  \
    idx2 = spu_extract(cindex, 2);                                  \
    idx3 = spu_extract(cindex, 3);                                  \
                                                                    \
    /* Speculatively load all four elements as if they hit. */      \
    d0 = *((type *) (&spe_cache_mem[idx0]));                        \
    d1 = *((type *) (&spe_cache_mem[idx1]));                        \
    d2 = *((type *) (&spe_cache_mem[idx2]));                        \
    d3 = *((type *) (&spe_cache_mem[idx3]));                        \
                                                                    \
    ret = _load_vec_uint4(d0, d1, d2, d3);                          \
                                                                    \
    if (unlikely(ms)) {                                             \
        int b0 = spu_extract(ibyte, 0);                             \
        int b1 = spu_extract(ibyte, 1);                             \
        int b2 = spu_extract(ibyte, 2);                             \
        int b3 = spu_extract(ibyte, 3);                             \
        int lnum0, lnum1, lnum2, lnum3;                             \
        int avail = -1;                                             \
        int s0 = spu_extract(s_x4, 0);                              \
        int s1 = spu_extract(s_x4, 1);                              \
        int s2 = spu_extract(s_x4, 2);                              \
        int s3 = spu_extract(s_x4, 3);                              \
        int i0 = spu_extract(i_x4, 0);                              \
        int i1 = spu_extract(i_x4, 1);                              \
        int i2 = spu_extract(i_x4, 2);                              \
        int i3 = spu_extract(i_x4, 3);                              \
        unsigned int ea0 = spu_extract(ea_x4, 0);                   \
        unsigned int ea1 = spu_extract(ea_x4, 1);                   \
        unsigned int ea2 = spu_extract(ea_x4, 2);                   \
        unsigned int ea3 = spu_extract(ea_x4, 3);                   \
                                                                    \
        /* Lightweight locking: mark the ways that hit as           \
           unavailable so the miss handler does not cast them out. */\
        avail &= ~(((i0 < 0) ? 0 : (1 << i0)) |                     \
                   ((i1 < 0) ? 0 : (1 << i1)) |                     \
                   ((i2 < 0) ? 0 : (1 << i2)) |                     \
                   ((i3 < 0) ? 0 : (1 << i3)));                     \
                                                                    \
        i0 = _spe_cache_miss_(ea0, s0, avail);                      \
        avail &= ~(1 << i0);                                        \
        i1 = _spe_cache_miss_(ea1, s1, avail);                      \
        avail &= ~(1 << i1);                                        \
        i2 = _spe_cache_miss_(ea2, s2, avail);                      \
        avail &= ~(1 << i2);                                        \
        i3 = _spe_cache_miss_(ea3, s3, avail);                      \
                                                                    \
        lnum0 = _spe_cacheline_num_(s0, i0);                        \
        lnum1 = _spe_cacheline_num_(s1, i1);                        \
        lnum2 = _spe_cacheline_num_(s2, i2);                        \
        lnum3 = _spe_cacheline_num_(s3, i3);                        \
                                                                    \
        /* Wait once for all four sets' transfers, letting the      \
           misses complete in parallel. */                          \
        spu_writech(22, SPE_CACHE_SET_TAGMASK(s0) |                 \
                        SPE_CACHE_SET_TAGMASK(s1) |                 \
                        SPE_CACHE_SET_TAGMASK(s2) |                 \
                        SPE_CACHE_SET_TAGMASK(s3));                 \
        spu_mfcstat(MFC_TAG_UPDATE_ALL);                            \
                                                                    \
        d0 = *((type *) (&spe_cache_mem[lnum0 + b0]));              \
        d1 = *((type *) (&spe_cache_mem[lnum1 + b1]));              \
        d2 = *((type *) (&spe_cache_mem[lnum2 + b2]));              \
        d3 = *((type *) (&spe_cache_mem[lnum3 + b3]));              \
                                                                    \
        ret = _load_vec_uint4(d0, d1, d2, d3);                      \
    }                                                               \
    ret;                                                            \
})
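
/*
 * Illustrative usage sketch (documentation only, not part of the
 * cache API): gathering four 32-bit elements whose effective
 * addresses were computed earlier. The variable names below are
 * hypothetical.
 *
 *     unsigned int ea0, ea1, ea2, ea3;
 *     ...
 *     vector unsigned int ea_x4 =
 *         (vector unsigned int){ ea0, ea1, ea2, ea3 };
 *     vector unsigned int v =
 *         __spe_cache_rd_x4(unsigned int, ea_x4);
 *
 * All four lookups share a single conditional branch; any lines that
 * miss are fetched with concurrent DMA transfers and the loads are
 * then retried, as described in the comment above.
 */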
#endif /* __SPE_CACHE_NWAY_OPT_H_ */