// image ('work_unit_size').
// TODO(andydavis)
- // *) Get L3 cache size from device at runtime (30MB is from ivybridge).
// *) Consider reducing 'target_working_set_size' if L3 is shared by
// other concurrently running tensorflow ops.
- const size_t target_working_set_size = (30LL << 20) / sizeof(T);
+ const size_t target_working_set_size = Eigen::l3CacheSize() / sizeof(T);
const size_t size_A = output_image_size * filter_total_size;
const int output_image_size =
dims.spatial_dims[0].output_size * dims.spatial_dims[1].output_size;
- // TODO(andydavis) Get L2/L3 cache sizes from device.
- const size_t l2_cache_size = 256LL << 10;
- const size_t l3_cache_size = 30LL << 20;
+ const size_t l2_cache_size = Eigen::l2CacheSize();
+ const size_t l3_cache_size = Eigen::l3CacheSize();
// Use L3 cache size as target working set size.
const size_t target_working_set_size = l3_cache_size / sizeof(T);
// Calculate filter transform batch based on cache/filter sizes.
- // Cache budget (based on L2 cache size = 256KB).
- // TODO(andydavis) Read cache size from system.
- const int64 cache_size = (256LL << 10) / sizeof(T);
+ // Cache budget (based on L2 cache size).
+ const int64 cache_size = Eigen::l2CacheSize() / sizeof(T);
// Fixed cost.
const int64 filter_transform_matrix_size =
const int64 filter_shard_size = filter_shards_row * filter_shards_col;
const int64 out_tile_spatial_size = out_tile_rows * out_tile_cols;
- // Cache budget (based on L2 cache size = 256KB).
- // TODO(andydavis) Read cache size from the system.
- const int64 cache_size = (256LL << 10) / sizeof(T);
+ // Cache budget (based on L2 cache size).
+ const int64 cache_size = Eigen::l2CacheSize() / sizeof(T);
// Fixed costs.
const int64 tile_transform_matrix_size =