From 7a313fc9c9968467d9e8d8bd28c92e1f971bdedc Mon Sep 17 00:00:00 2001 From: Dodji Seketeli Date: Fri, 6 Jan 2017 00:48:24 +0100 Subject: [PATCH] Better de-duplicate classes, unions, enums in non-odr contexts When reading an ELF/DWARF binary that doesn't support the One Definition Rule[1] (aka ODR), type de-duplication is done on a per-translation unit basis only. That means that a type definition must be unique only in a given translation unit, in that case. [1]: https://en.wikipedia.org/wiki/One_Definition_Rule When handling big C binaries like the Linux Kernel, this implies a lot of type duplication still, as the same type can be re-defined in a lot of different translation units. This patch tries to de-duplicate types on a per-corpus basis, even in cases where the ODR doesn't apply. It does so by noting that if two types of the same kind and name are seen in two different translation units and yet have the same source location (are defined in the same spot) then they are the same type. The patch does this for class, union and enum types as these seem to be the kinds of types which are duplicated the most and thus consume the most memory and later take the most time to canonicalize. In a subsequent patch, we might want to try to de-duplicate typedef types too, and see if gain anything. Comparing two linux kernels shows that with this patch, we come back to a speed (and memory consumption) that is comparable to when we were considering C-based binaries as being suited to our (too aggressive) ODR-based type de-duplication algorithm. Below are the output of the comparison measured with /usr/bin/time with the aggressive ODR-based type de-duplication algorithm and with this patch. We compare the vmlinux binary coming from the package kernel-3.10.0-515 against the one from kernel-3.10.0-327.el7.x86_64. We use the kabi whitelists provided by the relevant kernel-abi-whitelists packages for these kernels to restrict the comparison to a meaningful subset of interfaces. 52.66user 0.64system 0:53.34elapsed 99%CPU (0avgtext+0avgdata 944416maxresident)k 0inputs+48outputs (0major+250750minor)pagefaults 0swaps vs: 51.04user 0.87system 0:51.91elapsed 100%CPU (0avgtext+0avgdata 972560maxresident)k 0inputs+232outputs (0major+279983minor)pagefaults 0swaps The full invocation and results are available at: http://people.redhat.com/~dseketel/kabidiff/vmlinuz-3.10.0-327.el7.x86_64--vmlinuz-3.10.0-515.el7.x86_64.diff.whitelisted.with-unions.txt vs http://people.redhat.com/~dseketel/kabidiff/vmlinuz-3.10.0-327.el7.x86_64--vmlinuz-3.10.0-515.el7.x86_64.diff.whitelisted.with-unions.with-non-odr-support.txt Note that to be able to compare the two kernels, the current tree must contain the necessary patches that make libabigail understand Linux Kernel binaries. * src/abg-dwarf-reader.cc (build_enum_type) (add_or_update_class_type, add_or_update_union_type): When the ODR is not relevant, use the location of the type to detect if two enum, class or union types of the same name actually represent the same type. Signed-off-by: Dodji Seketeli --- src/abg-dwarf-reader.cc | 139 +++++++++++++++++++++++++++++----------- 1 file changed, 103 insertions(+), 36 deletions(-) diff --git a/src/abg-dwarf-reader.cc b/src/abg-dwarf-reader.cc index 63d1142e..bfd46f8b 100644 --- a/src/abg-dwarf-reader.cc +++ b/src/abg-dwarf-reader.cc @@ -9799,16 +9799,36 @@ build_enum_type(read_context& ctxt, Dwarf_Die* die, size_t where_offset) enum_is_anonymous = true; } - bool odr_is_relevant = !enum_is_anonymous && ctxt.odr_is_relevant(); + bool use_odr = ctxt.odr_is_relevant(); + // If the type has location, then associate it to its + // representation. This way, all occurences of types with the same + // representation (name) and location can be later detected as being + // for the same type. + bool associate_die_to_repr = !enum_is_anonymous && (loc || use_odr); - if (odr_is_relevant) - if (enum_type_decl_sptr pre_existing_enum = - is_enum_type(ctxt.lookup_artifact_from_die(die, where_offset))) - { - result = pre_existing_enum; - ctxt.associate_die_to_type(die, result, where_offset); - return result; - } + if (!enum_is_anonymous) + { + if (use_odr) + { + if (enum_type_decl_sptr pre_existing_enum = + is_enum_type(ctxt.lookup_artifact_from_die(die, where_offset))) + result = pre_existing_enum; + } + else if (loc) + { + if (enum_type_decl_sptr pre_existing_enum = + is_enum_type(ctxt.lookup_artifact_from_die(die, where_offset))) + if (pre_existing_enum->get_location() == loc) + result = pre_existing_enum; + } + + if (result) + { + ctxt.associate_die_to_type(die, result, where_offset, + /*associate_die_to_repr=*/false); + return result; + } + } uint64_t size = 0; if (die_unsigned_constant_attribute(die, DW_AT_byte_size, size)) @@ -9861,7 +9881,7 @@ build_enum_type(read_context& ctxt, Dwarf_Die* die, size_t where_offset) assert(t); result.reset(new enum_type_decl(name, loc, t, enms, linkage_name)); result->set_is_anonymous(enum_is_anonymous); - ctxt.associate_die_to_type(die, result, where_offset); + ctxt.associate_die_to_type(die, result, where_offset, associate_die_to_repr); return result; } @@ -10249,18 +10269,43 @@ add_or_update_class_type(read_context& ctxt, } bool use_odr = ctxt.odr_is_relevant(); - bool do_associate_by_repr = !is_anonymous && use_odr; - - if (do_associate_by_repr) - // So we can rely on the One Definition Rule to say that if - // several different named classes have the same name across - // the entire binary, then they ought to designate the same - // type. So let's ensure that if we've already seen a class - // with the same name as the name of 'die', then it's the - // same type as the one denoted by 'die'. - if (class_decl_sptr pre_existing_class = - is_class_type(ctxt.lookup_artifact_from_die(die, where_offset))) - klass = pre_existing_class; + + // If the type has location, then associate it to its + // representation. This way, all occurences of types with the same + // representation (name) and location can be later detected as being + // for the same type. + bool associate_die_to_repr = !is_anonymous && (loc || use_odr); + + if (!is_anonymous) + { + if (use_odr) + { + // So we can rely on the One Definition Rule to say that if + // several different named class types have the same + // representation (name) across the entire binary, then they + // ought to designate the same type. So let's ensure that + // if we've already seen a class with the same name as the + // name of 'die', then it's the same type as the one denoted + // by 'die'. + if (class_decl_sptr pre_existing_class = + is_class_type(ctxt.lookup_artifact_from_die(die, where_offset))) + klass = pre_existing_class; + } + else if (loc) + { + // The current type we are looking at does have a location. + // So, even if we can't assume ODR, we can assume that this + // class type we are looking at is the same as other class + // types with the same representation and the same location. + if (class_decl_sptr pre_existing_class = + is_class_type(ctxt.lookup_artifact_from_die(die, where_offset))) + if (class_decl_sptr cl = + look_through_decl_only_class(pre_existing_class)) + if (!cl->get_definition_of_declaration() + || cl->get_location() == loc) + klass = pre_existing_class; + } + } uint64_t size = 0; die_size_in_bits(die, size); @@ -10297,7 +10342,7 @@ add_or_update_class_type(read_context& ctxt, result->set_is_declaration_only(false); } - ctxt.associate_die_to_type(die, result, where_offset, do_associate_by_repr); + ctxt.associate_die_to_type(die, result, where_offset, associate_die_to_repr); ctxt.maybe_schedule_declaration_only_class_for_resolution(result); @@ -10539,18 +10584,40 @@ add_or_update_union_type(read_context& ctxt, } bool use_odr = ctxt.odr_is_relevant(); - bool do_associate_by_repr = !is_anonymous && use_odr; - - if (do_associate_by_repr) - // So we can rely on the One Definition Rule to say that if - // several different named unions have the same name across the - // entire binary, then they ought to designate the same type. - // So let's ensure that if we've already seen a union with the - // same name as the name of 'die', then it's the same type as - // the one denoted by 'die'. - if (union_decl_sptr pre_existing_union = - is_union_type(ctxt.lookup_artifact_from_die(die, where_offset))) - union_type = pre_existing_union; + + // If the type has location, then associate it to its + // representation. This way, all occurences of types with the same + // representation (name) and location can be later detected as being + // for the same type. + bool associate_die_to_repr = !is_anonymous && (loc || use_odr); + + if (!is_anonymous) + { + if (use_odr) + { + // So we can rely on the One Definition Rule to say that if + // several different named union types have the same + // representation (name) across the entire binary, then they + // ought to designate the same type. So let's ensure that + // if we've already seen a union with the same name as the + // name of 'die', then it's the same type as the one denoted + // by 'die'. + if (union_decl_sptr pre_existing_union = + is_union_type(ctxt.lookup_artifact_from_die(die, where_offset))) + union_type = pre_existing_union; + } + else if (loc) + { + // The current type we are looking at does have a location. + // So, even if we can't assume ODR, we can assume that this + // union type we are looking at is the same as other union + // types with the same representation and the same location. + if (union_decl_sptr pre_existing_union = + is_union_type(ctxt.lookup_artifact_from_die(die, where_offset))) + if (pre_existing_union->get_location() == loc) + union_type = pre_existing_union; + } + } uint64_t size = 0; die_size_in_bits(die, size); @@ -10579,7 +10646,7 @@ add_or_update_union_type(read_context& ctxt, result->set_is_declaration_only(false); } - ctxt.associate_die_to_type(die, result, where_offset, do_associate_by_repr); + ctxt.associate_die_to_type(die, result, where_offset, associate_die_to_repr); // TODO: maybe schedule declaration-only union for result like we do // for classes: -- 2.34.1