along with GCC; see the file COPYING3. If not see
<http://www.gnu.org/licenses/>. */
-/* The purpose of the store merging pass is to combine multiple memory
- stores of constant values, values loaded from memory or bitwise operations
- on those to consecutive memory locations into fewer wider stores.
+/* The purpose of the store merging pass is to combine multiple memory stores
+ of constant values, values loaded from memory, bitwise operations on those,
+ or bit-field values, to consecutive locations, into fewer wider stores.
+
For example, if we have a sequence performing four byte stores to
consecutive memory locations:
[p     ] := imm1;
[p + 1B] := imm2;
[p + 2B] := imm3;
[p + 3B] := imm4;
we can transform this into a single 4-byte store if the target supports it:
- [p] := imm1:imm2:imm3:imm4 //concatenated immediates according to endianness.
+ [p] := imm1:imm2:imm3:imm4 concatenated according to endianness.
Or:
[p     ] := [q     ];
[p + 1B] := [q + 1B];
[p + 2B] := [q + 2B];
[p + 3B] := [q + 3B];
if there is no overlap this can be transformed into a single 4-byte
load followed by a single 4-byte store.
Or:
[p     ] := [q     ] ^ imm1;
[p + 1B] := [q + 1B] ^ imm2;
[p + 2B] := [q + 2B] ^ imm3;
[p + 3B] := [q + 3B] ^ imm4;
if there is no overlap this can be transformed into a single 4-byte
load, xored with imm1:imm2:imm3:imm4 and stored using a single 4-byte store.
+ Or:
+ [p:1 ] := imm;
+ [p:31] := val & 0x7FFFFFFF;
+ we can transform this into a single 4-byte store if the target supports it:
+ [p] := imm:(val & 0x7FFFFFFF) concatenated according to endianness.
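+   As a concrete illustration (a hypothetical example, not taken from the
+   source), on a little-endian target the last case lets the pass turn
+     struct S { unsigned int a : 1; unsigned int b : 31; };
+     void f (struct S *p, unsigned int val)
+     {
+       p->a = 1;
+       p->b = val;
+     }
+   into a single 4-byte store of (val << 1) | 1, provided the target
+   supports such a store.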
+
The algorithm is applied to each basic block in three phases:
- 1) Scan through the basic block recording assignments to
- destinations that can be expressed as a store to memory of a certain size
- at a certain bit offset from expressions we can handle. For bit-fields
- we also note the surrounding bit region, bits that could be stored in
+ 1) Scan through the basic block and record assignments to destinations
+ that can be expressed as a store to memory of a certain size at a certain
+ bit offset from base expressions we can handle. For bit-fields we also
+ record the surrounding bit region, i.e. bits that could be stored in
a read-modify-write operation when storing the bit-field. Record store
chains to different bases in a hash_map (m_stores) and make sure to
terminate such chains when appropriate (for example when the stored
values get used subsequently).  These stores can be a result of
structure element initializers, array stores etc.  A store_immediate_info
object is recorded for every such store.
Record as many such assignments to a single base as possible until a
statement that interferes with the store sequence is encountered.
- Each store has up to 2 operands, which can be an immediate constant
- or a memory load, from which the value to be stored can be computed.
+ Each store has up to 2 operands, which can be either a constant, a memory
+ load or an SSA name, from which the value to be stored can be computed.
At most one of the operands can be a constant. The operands are recorded
in the store_operand_info struct.
- 2) Analyze the chain of stores recorded in phase 1) (i.e. the vector of
+ 2) Analyze the chains of stores recorded in phase 1) (i.e. the vector of
store_immediate_info objects) and coalesce contiguous stores into
- merged_store_group objects. For bit-fields stores, we don't need to
+ merged_store_group objects. For bit-field stores, we don't need to
require the stores to be contiguous, just their surrounding bit regions
have to be contiguous. If the expression being stored is different
between adjacent stores, such as one store storing a constant and
following storing a value loaded from memory, or if the loaded memory
objects are not adjacent, a new merged_store_group is created as well.
3) The merged_store_group objects produced in phase 2) are processed
to generate the sequence of wider stores that set the contiguous memory
regions to the sequence of bytes that correspond to it.  This may emit
multiple stores per store group to handle contiguous stores that are not
of a size that is a power of 2. For example it can try to emit a 40-bit
store as a 32-bit store followed by an 8-bit store.
- We try to emit as wide stores as we can while respecting STRICT_ALIGNMENT or
- TARGET_SLOW_UNALIGNED_ACCESS rules.
+ We try to emit as wide stores as we can while respecting STRICT_ALIGNMENT
+ or TARGET_SLOW_UNALIGNED_ACCESS settings.
Note on endianness and example:
Consider 2 contiguous 16-bit stores followed by 2 contiguous 8-bit stores:
#include "tree-hash-traits.h"
#include "gimple-iterator.h"
#include "gimplify.h"
+#include "gimple-fold.h"
#include "stor-layout.h"
#include "timevar.h"
#include "tree-cfg.h"
namespace {
/* Struct recording one operand for the store, which is either a constant,
- then VAL represents the constant and all the other fields are zero,
- or a memory load, then VAL represents the reference, BASE_ADDR is non-NULL
- and the other fields also reflect the memory load. */
+ then VAL represents the constant and all the other fields are zero, or
+ a memory load, then VAL represents the reference, BASE_ADDR is non-NULL
+ and the other fields also reflect the memory load, or an SSA name, then
+ VAL represents the SSA name and all the other fields are zero.  */
struct store_operand_info
{
unsigned HOST_WIDE_INT bitregion_end;
gimple *stmt;
unsigned int order;
- /* INTEGER_CST for constant stores, MEM_REF for memory copy or
- BIT_*_EXPR for logical bitwise operation.
+ /* INTEGER_CST for constant stores, MEM_REF for memory copy,
+ BIT_*_EXPR for logical bitwise operation, BIT_INSERT_EXPR
+ for bit insertion.
LROTATE_EXPR if it can only be bswap-optimized and the
ops are not really meaningful.
NOP_EXPR if bswap optimization detected identity, ops
gimple *first_stmt;
unsigned char *val;
unsigned char *mask;
+ bool bit_insertion;
merged_store_group (store_immediate_info *);
~merged_store_group ();
return;
for (unsigned int i = 0; i < len; i++)
- fprintf (fd, "%x ", ptr[i]);
+ fprintf (fd, "%02x ", ptr[i]);
fprintf (fd, "\n");
}
width has been finalized. */
val = NULL;
mask = NULL;
+ bit_insertion = false;
unsigned HOST_WIDE_INT align_bitpos = 0;
get_object_alignment_1 (gimple_assign_lhs (info->stmt),
&align, &align_bitpos);
stores.qsort (sort_by_order);
store_immediate_info *info;
unsigned int i;
- /* Create a buffer of a size that is 2 times the number of bytes we're
- storing. That way native_encode_expr can write power-of-2-sized
- chunks without overrunning. */
- buf_size = 2 * ((bitregion_end - bitregion_start) / BITS_PER_UNIT);
+ /* Create a power-of-2-sized buffer for native_encode_expr. */
+ buf_size = 1 << ceil_log2 ((bitregion_end - bitregion_start) / BITS_PER_UNIT);
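+  /* For example, a 48-bit bitregion covers 6 bytes, so buf_size becomes
+     1 << ceil_log2 (6) == 8 and native_encode_expr can write its
+     power-of-2-sized chunks without overrunning the buffer.  */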
val = XNEWVEC (unsigned char, 2 * buf_size);
mask = val + buf_size;
memset (val, 0, buf_size);
FOR_EACH_VEC_ELT (stores, i, info)
{
unsigned int pos_in_buffer = info->bitpos - bitregion_start;
- tree cst = NULL_TREE;
+ tree cst;
if (info->ops[0].val && info->ops[0].base_addr == NULL_TREE)
cst = info->ops[0].val;
else if (info->ops[1].val && info->ops[1].base_addr == NULL_TREE)
cst = info->ops[1].val;
+ else
+ cst = NULL_TREE;
bool ret = true;
if (cst)
- ret = encode_tree_to_bitpos (cst, val, info->bitsize,
- pos_in_buffer, buf_size);
+ {
+ if (info->rhs_code == BIT_INSERT_EXPR)
+ bit_insertion = true;
+ else
+ ret = encode_tree_to_bitpos (cst, val, info->bitsize,
+ pos_in_buffer, buf_size);
+ }
+ unsigned char *m = mask + (pos_in_buffer / BITS_PER_UNIT);
+ if (BYTES_BIG_ENDIAN)
+ clear_bit_region_be (m, (BITS_PER_UNIT - 1
+ - (pos_in_buffer % BITS_PER_UNIT)),
+ info->bitsize);
+ else
+ clear_bit_region (m, pos_in_buffer % BITS_PER_UNIT, info->bitsize);
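+      /* For instance, with 8-bit units, a 5-bit store at pos_in_buffer 11
+         clears bits 3 to 7 of mask byte 1 on a little-endian target.  */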
if (cst && dump_file && (dump_flags & TDF_DETAILS))
{
if (ret)
{
- fprintf (dump_file, "After writing ");
+ fputs ("After writing ", dump_file);
print_generic_expr (dump_file, cst, 0);
fprintf (dump_file, " of size " HOST_WIDE_INT_PRINT_DEC
- " at position %d the merged region contains:\n",
- info->bitsize, pos_in_buffer);
+ " at position %d\n", info->bitsize, pos_in_buffer);
+ fputs (" the merged value contains ", dump_file);
dump_char_array (dump_file, val, buf_size);
+ fputs (" the merged mask contains ", dump_file);
+ dump_char_array (dump_file, mask, buf_size);
+ if (bit_insertion)
+ fputs (" bit insertion is required\n", dump_file);
}
else
fprintf (dump_file, "Failed to merge stores\n");
}
if (!ret)
return false;
- unsigned char *m = mask + (pos_in_buffer / BITS_PER_UNIT);
- if (BYTES_BIG_ENDIAN)
- clear_bit_region_be (m, (BITS_PER_UNIT - 1
- - (pos_in_buffer % BITS_PER_UNIT)),
- info->bitsize);
- else
- clear_bit_region (m, pos_in_buffer % BITS_PER_UNIT, info->bitsize);
}
stores.qsort (sort_by_bitpos);
return true;
{
}
- /* Pass not supported for PDP-endianness, nor for insane hosts
- or target character sizes where native_{encode,interpret}_expr
+ /* Pass not supported for PDP-endian, nor for insane hosts or
+ target character sizes where native_{encode,interpret}_expr
doesn't work properly. */
virtual bool
gate (function *)
{
return flag_store_merging
- && WORDS_BIG_ENDIAN == BYTES_BIG_ENDIAN
+ && BYTES_BIG_ENDIAN == WORDS_BIG_ENDIAN
&& CHAR_BIT == 8
&& BITS_PER_UNIT == 8;
}
return false;
if (dump_file && (dump_flags & TDF_DETAILS))
- fprintf (dump_file, "Attempting to coalesce %u stores in chain.\n",
+ fprintf (dump_file, "Attempting to coalesce %u stores in chain\n",
m_store_info.length ());
store_immediate_info *info;
info = m_store_info[0];
merged_store_group *merged_store = new merged_store_group (info);
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ fputs ("New store group\n", dump_file);
FOR_EACH_VEC_ELT (m_store_info, i, info)
{
- if (dump_file && (dump_flags & TDF_DETAILS))
- {
- fprintf (dump_file, "Store %u:\nbitsize:" HOST_WIDE_INT_PRINT_DEC
- " bitpos:" HOST_WIDE_INT_PRINT_DEC " val:\n",
- i, info->bitsize, info->bitpos);
- print_generic_expr (dump_file, gimple_assign_rhs1 (info->stmt));
- fprintf (dump_file, "\n------------\n");
- }
-
if (i <= ignore)
- continue;
+ goto done;
/* First try to handle group of stores like:
p[0] = data >> 24;
merged_store = new merged_store_group (m_store_info[ignore]);
else
merged_store = NULL;
- continue;
+ goto done;
}
}
&& merged_store->stores[0]->rhs_code == INTEGER_CST)
{
merged_store->merge_overlapping (info);
- continue;
+ goto done;
}
}
/* |---store 1---||---store 2---|
This store is consecutive to the previous one.
Merge it into the current store group. There can be gaps in between
the stores, but there can't be gaps in between bitregions. */
- else if (info->rhs_code != LROTATE_EXPR
- && info->bitregion_start <= merged_store->bitregion_end
- && info->rhs_code == merged_store->stores[0]->rhs_code)
+ else if (info->bitregion_start <= merged_store->bitregion_end
+ && info->rhs_code != LROTATE_EXPR
+ && (info->rhs_code == merged_store->stores[0]->rhs_code
+ || (info->rhs_code == INTEGER_CST
+ && merged_store->stores[0]->rhs_code == BIT_INSERT_EXPR)
+ || (info->rhs_code == BIT_INSERT_EXPR
+ && merged_store->stores[0]->rhs_code == INTEGER_CST)))
{
store_immediate_info *infof = merged_store->stores[0];
info->bitpos + info->bitsize)))
{
merged_store->merge_into (info);
- continue;
+ goto done;
}
}
/* Try to apply all the stores recorded for the group to determine
the bitpattern they write and discard it if that fails.
This will also reject single-store groups. */
- if (!merged_store->apply_stores ())
- delete merged_store;
- else
+ if (merged_store->apply_stores ())
m_merged_store_groups.safe_push (merged_store);
+ else
+ delete merged_store;
merged_store = new merged_store_group (info);
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ fputs ("New store group\n", dump_file);
+
+ done:
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ {
+ fprintf (dump_file, "Store %u:\nbitsize:" HOST_WIDE_INT_PRINT_DEC
+ " bitpos:" HOST_WIDE_INT_PRINT_DEC " val:",
+ i, info->bitsize, info->bitpos);
+ print_generic_expr (dump_file, gimple_assign_rhs1 (info->stmt));
+ fputc ('\n', dump_file);
+ }
}
/* Record or discard the last store group. */
if (merged_store)
{
- if (!merged_store->apply_stores ())
- delete merged_store;
- else
+ if (merged_store->apply_stores ())
m_merged_store_groups.safe_push (merged_store);
+ else
+ delete merged_store;
}
gcc_assert (m_merged_store_groups.length () <= m_store_info.length ());
+
bool success
= !m_merged_store_groups.is_empty ()
&& m_merged_store_groups.length () < m_store_info.length ();
if (success && dump_file)
- fprintf (dump_file, "Coalescing successful!\n"
- "Merged into %u stores\n",
+ fprintf (dump_file, "Coalescing successful!\nMerged into %u stores\n",
m_merged_store_groups.length ());
return success;
return 1;
}
return 0;
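+    /* The value being inserted stays live after the store if it has
+       uses other than this statement.  */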
+ case BIT_INSERT_EXPR:
+ return has_single_use (gimple_assign_rhs1 (stmt)) ? 0 : 1;
default:
gcc_unreachable ();
}
{
unsigned HOST_WIDE_INT try_size = split_store->size;
unsigned HOST_WIDE_INT try_pos = split_store->bytepos;
+ unsigned HOST_WIDE_INT try_bitpos = try_pos * BITS_PER_UNIT;
unsigned HOST_WIDE_INT align = split_store->align;
tree dest, src;
location_t loc;
MR_DEPENDENCE_BASE (dest) = base;
}
- tree mask = integer_zero_node;
- if (!bswap_res)
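+      /* A set bit in the mask means that the corresponding bit position
+         is not covered by any of the original stores, so its previous
+         contents must be preserved by a read-modify-write sequence.  */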
+ tree mask;
+ if (bswap_res)
+ mask = integer_zero_node;
+ else
mask = native_interpret_expr (int_type,
group->mask + try_pos
- start_byte_pos,
unsigned HOST_WIDE_INT load_align = group->load_align[j];
unsigned HOST_WIDE_INT align_bitpos
- = known_alignment (try_pos * BITS_PER_UNIT
+ = known_alignment (try_bitpos
- split_store->orig_stores[0]->bitpos
+ op.bitpos);
if (align_bitpos & (load_align - 1))
= build_aligned_type (load_int_type, load_align);
poly_uint64 load_pos
- = exact_div (try_pos * BITS_PER_UNIT
+ = exact_div (try_bitpos
- split_store->orig_stores[0]->bitpos
+ op.bitpos,
BITS_PER_UNIT);
break;
}
+ /* If bit insertion is required, we use the source as an accumulator
+ into which the successive bit-field values are manually inserted.
+ FIXME: perhaps use BIT_INSERT_EXPR instead in some cases? */
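+      /* For example (little-endian, try_bitpos 0, try_size 32), a 31-bit
+         value at bitpos 1 has start_gap 1 and end_gap 0: it needs no
+         masking and is shifted left by 1 before being ior'd into SRC,
+         while a 1-bit value at bitpos 0 has end_gap 31: it is masked
+         with 0x1 and ior'd in at shift 0.  */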
+ if (group->bit_insertion)
+ FOR_EACH_VEC_ELT (split_store->orig_stores, k, info)
+ if (info->rhs_code == BIT_INSERT_EXPR
+ && info->bitpos < try_bitpos + try_size
+ && info->bitpos + info->bitsize > try_bitpos)
+ {
+ /* Mask, truncate, convert to final type, shift and ior into
+ the accumulator. Note that every step can be a no-op. */
+ const HOST_WIDE_INT start_gap = info->bitpos - try_bitpos;
+ const HOST_WIDE_INT end_gap
+ = (try_bitpos + try_size) - (info->bitpos + info->bitsize);
+ tree tem = info->ops[0].val;
+ if ((BYTES_BIG_ENDIAN ? start_gap : end_gap) > 0)
+ {
+ const unsigned HOST_WIDE_INT imask
+ = (HOST_WIDE_INT_1U << info->bitsize) - 1;
+ tem = gimple_build (&seq, loc,
+ BIT_AND_EXPR, TREE_TYPE (tem), tem,
+ build_int_cst (TREE_TYPE (tem),
+ imask));
+ }
+ const HOST_WIDE_INT shift
+ = (BYTES_BIG_ENDIAN ? end_gap : start_gap);
+ if (shift < 0)
+ tem = gimple_build (&seq, loc,
+ RSHIFT_EXPR, TREE_TYPE (tem), tem,
+ build_int_cst (NULL_TREE, -shift));
+ tem = gimple_convert (&seq, loc, int_type, tem);
+ if (shift > 0)
+ tem = gimple_build (&seq, loc,
+ LSHIFT_EXPR, int_type, tem,
+ build_int_cst (NULL_TREE, shift));
+ src = gimple_build (&seq, loc,
+ BIT_IOR_EXPR, int_type, tem, src);
+ }
+
if (!integer_zerop (mask))
{
tree tem = make_ssa_name (int_type);
if (dump_file)
{
fprintf (dump_file,
- "New sequence of %u stmts to replace old one of %u stmts\n",
+ "New sequence of %u stores to replace old one of %u stores\n",
split_stores.length (), orig_num_stmts);
if (dump_flags & TDF_DETAILS)
print_gimple_seq (dump_file, seq, 0, TDF_VOPS | TDF_MEMSYMS);
def_stmt = SSA_NAME_DEF_STMT (rhs1);
}
}
+
if (rhs_code == ERROR_MARK && !invalid)
switch ((rhs_code = gimple_assign_rhs_code (def_stmt)))
{
invalid = true;
break;
}
+
unsigned HOST_WIDE_INT const_bitsize;
if (bitsize.is_constant (&const_bitsize)
- && multiple_p (const_bitsize, BITS_PER_UNIT)
- && multiple_p (bitpos, BITS_PER_UNIT)
+ && (const_bitsize % BITS_PER_UNIT) == 0
&& const_bitsize <= 64
- && BYTES_BIG_ENDIAN == WORDS_BIG_ENDIAN)
+ && multiple_p (bitpos, BITS_PER_UNIT))
{
ins_stmt = find_bswap_or_nop_1 (def_stmt, &n, 12);
if (ins_stmt)
}
}
}
+
+ if (invalid
+ && bitsize.is_constant (&const_bitsize)
+ && ((const_bitsize % BITS_PER_UNIT) != 0
+ || !multiple_p (bitpos, BITS_PER_UNIT))
+ && const_bitsize <= 64)
+ {
+ /* Bypass a truncating conversion to the bit-field type. */
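+          /* E.g. for a 31-bit bit-field store of val_2, the input is
+             _1 = (<unnamed-unsigned:31>) val_2; and we can record val_2
+             itself, since its precision is at least 31 bits.  */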
+ if (is_gimple_assign (def_stmt) && CONVERT_EXPR_CODE_P (rhs_code))
+ {
+ tree rhs1 = gimple_assign_rhs1 (def_stmt);
+ if (TREE_CODE (rhs1) == SSA_NAME
+ && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
+ && const_bitsize <= TYPE_PRECISION (TREE_TYPE (rhs1)))
+ rhs = rhs1;
+ }
+ rhs_code = BIT_INSERT_EXPR;
+ ops[0].val = rhs;
+ ops[0].base_addr = NULL_TREE;
+ ops[1].base_addr = NULL_TREE;
+ invalid = false;
+ }
}
unsigned HOST_WIDE_INT const_bitsize, const_bitpos;