From 0a49080f797018268c9bcc28806cd6242d5c7478 Mon Sep 17 00:00:00 2001 From: Antoni Adaszkiewicz Date: Mon, 29 Aug 2022 14:19:06 +0200 Subject: [PATCH] CreatePatch.py: Add support for hardlinks during delta generation. Change-Id: Ie3fd13d9557f36222d96c61db1ce4b54cb5e2d82 --- mk_delta/common/bin/CreatePatch.py | 190 ++++++++++++++++++++++--------------- 1 file changed, 114 insertions(+), 76 deletions(-) diff --git a/mk_delta/common/bin/CreatePatch.py b/mk_delta/common/bin/CreatePatch.py index 43e7f14..99399df 100755 --- a/mk_delta/common/bin/CreatePatch.py +++ b/mk_delta/common/bin/CreatePatch.py @@ -82,6 +82,7 @@ NEW_FILES_ZIP_NAME = "system.7z" SYMLINK_TYPE = "SYM" ATTR_DOC_EXT = "_attr.txt" SYMLINK_DOC_NAME = "_sym.txt" +HARDLINK_DOC_NAME = "_hard.txt" PART_DOC_EXT = ".txt" DIFF_PREFIX = "diff" DIFF_SUFFIX = ".delta" @@ -330,7 +331,7 @@ def Diff_AttrFiles(ATTR_OLD, ATTR_NEW, ATTR_FILE): file_out.write(line + '\n') -def Update_Attr(RequestedPath, Type, File_Attibutes, Sym_Attibutes): +def Update_Attr(RequestedPath, Type, File_Attributes, Sym_Attributes): # Full File Path should MATCH if GenerateDiffAttr == "FALSE": return @@ -340,9 +341,9 @@ def Update_Attr(RequestedPath, Type, File_Attibutes, Sym_Attibutes): for line in f: if FilePath in line: if Type == SYMLINK_TYPE: - Sym_Attibutes.append(line) + Sym_Attributes.append(line) else: - File_Attibutes.append(line) + File_Attributes.append(line) def hash_file(filename): @@ -364,50 +365,14 @@ def hash_file(filename): return h.hexdigest() -def find_dupes_dir(BASE_OLD, BASE_NEW): - dups = {} - fdupes = {} - print('Finding Duplicates in - %s %s' % (BASE_OLD, BASE_NEW)) - logging.info('Finding Duplicates in - %s %s' % (BASE_OLD, BASE_NEW)) - for rootbase, subdirsB, fileListB in os.walk(BASE_OLD): - #print('Scanning %s...' % rootbase) - for filename in fileListB: - path = os.path.join(rootbase, filename) - if os.path.islink(path): - continue - # Calculate hash - file_hash = hash_file(path) - dups[file_hash] = path - - for roottarget, subdirsT, fileListT in os.walk(BASE_NEW): - #print('Scanning %s...' % roottarget) - for filename in fileListT: - # Get the path to the file - path = os.path.join(roottarget, filename) - if os.path.islink(path): - continue - # Calculate hash - file_hash = hash_file(path) - # Add or append the file path - if file_hash in dups: - BaseStr = dups.get(file_hash) - Baseloc = path.find('/') - TarLoc = BaseStr.find('/') - if not path[Baseloc:] == BaseStr[TarLoc:]: - logging.info('Dupes - %s ==> %s' % (path[Baseloc:], BaseStr[TarLoc:])) - fdupes[path] = BaseStr - logging.info('Total Duplicate files %d' % (len(fdupes))) - return fdupes - - -def find_dupes_list(BASE_OLD, BASE_NEW, fileListB, fileListT): +def find_dupes_list(BASE_OLD, BASE_NEW, fileListB, fileListT, Old_hardlinks, New_hardlinks): dups = {} fdupes = {} print('Finding Duplicates in - %s %s' % (BASE_OLD, BASE_NEW)) for filename in fileListB: Src_File = BASE_OLD + '/' + filename - if os.path.islink(Src_File) or os.path.isdir(Src_File): + if os.path.islink(Src_File) or os.path.isdir(Src_File) or ishardlink(Src_File): continue # Calculate hash file_hash = hash_file(Src_File) @@ -415,7 +380,7 @@ def find_dupes_list(BASE_OLD, BASE_NEW, fileListB, fileListT): for filename in fileListT: Dest_File = BASE_NEW + '/' + filename - if os.path.islink(Dest_File) or os.path.isdir(Dest_File): + if os.path.islink(Dest_File) or os.path.isdir(Dest_File) or ishardlink(Dest_File): continue # Calculate hash file_hash = hash_file(Dest_File) @@ -425,7 +390,6 @@ def find_dupes_list(BASE_OLD, BASE_NEW, fileListB, fileListT): if not BaseStr[Baseloc:] == filename: #print('Dupes - %s ==> %s' % (BaseStr[Baseloc:], filename)) fdupes[BaseStr] = filename - logging.info('Total Duplicate files %d' % (len(fdupes))) return fdupes @@ -456,14 +420,15 @@ def SS_Generate_Delta(PART_NAME, BASE_OLD, Old_files, Old_dirs, BASE_NEW, New_fi files_changed = [] files_unchanged = [] files_renamed = [] - File_Attibutes = [] - Sym_Attibutes = [] + File_Attributes = [] + Sym_Attributes = [] files_Del_List = {} files_New_List = {} - MyDict_Patches = {} - PWD = os.getcwd() + # Get dictionaries used for hardlinks form both directories + New_hardlinks = get_hardlinks(BASE_NEW) + Old_hardlinks = get_hardlinks(BASE_OLD) # Generate NEW List for elt in New_files: @@ -498,14 +463,20 @@ def SS_Generate_Delta(PART_NAME, BASE_OLD, Old_files, Old_dirs, BASE_NEW, New_fi dst_file = BASE_NEW + '/' + elt #print('Files Changed - %s -%s' % (src_file,dst_file)) if os.path.islink(src_file) and os.path.islink(dst_file): - if not os.readlink(src_file) == os.readlink(dst_file): + if not (os.readlink(src_file) == os.readlink(dst_file)): files_changed.append(elt) #print('%d Sym link files changed' % len(files_changed)) logging.info('Sym links Changed - %s' % elt) else: files_unchanged.append(elt) - # Both are Normal files and they differ. (Is file returns true in case of symlink also, so additional check to find either of the file is symlink) - elif (not (os.path.islink(src_file) or os.path.islink(dst_file))) and os.path.isfile(src_file) and os.path.isfile(dst_file): + # Both are hardlinks - we add them because we can't be sure if file they point to changes + elif elt in New_hardlinks and elt in Old_hardlinks: + files_changed.append(elt) + # Both are Normal files and they differ. (Is file returns true in case of sym/hardlink also, + # so additional check to find either of the file is sym/hardlink) + elif (not (os.path.islink(src_file) or os.path.islink(dst_file))) \ + and (not (elt in New_hardlinks or elt in Old_hardlinks)) \ + and os.path.isfile(src_file) and os.path.isfile(dst_file): if not filecmp.cmp(src_file, dst_file): files_changed.append(elt) #print('%d Normal files changed' % len(files_changed)) @@ -561,9 +532,10 @@ def SS_Generate_Delta(PART_NAME, BASE_OLD, Old_files, Old_dirs, BASE_NEW, New_fi # this file is the same name in both! src_file = BASE_OLD + '/' + value dst_file = BASE_NEW + '/' + files_New_List[key] - olddirpath = path_head(files_New_List[key]) - newdirpath = path_head(value) - if os.path.islink(src_file) or os.path.islink(dst_file): + # we don't want to move hardlinks + if ishardlink(src_file) or ishardlink(dst_file): + logging.debug('Cannot diff as one of them is a hardlink') + elif os.path.islink(src_file) or os.path.islink(dst_file): logging.debug('Cannot diff as one of them is Symlink') elif os.path.isdir(src_file) or os.path.isdir(dst_file): logging.debug('Cannot diff as one of them is dir') @@ -580,14 +552,18 @@ def SS_Generate_Delta(PART_NAME, BASE_OLD, Old_files, Old_dirs, BASE_NEW, New_fi ''' Sym_Diff_Cnt = 0 Sym_New_Cnt = 0 + Hard_Diff_Cnt = 0 + Hard_New_Cnt = 0 Del_Cnt = 0 New_Cnt = 0 Diff_Cnt = 0 Move_Cnt = 0 Verbatim_Cnt = 0 SymLinkDoc = OUT_DIR + '/' + PART_NAME + SYMLINK_DOC_NAME + HardLinkDoc = OUT_DIR + '/' + PART_NAME + HARDLINK_DOC_NAME Partition_Doc = open(OUT_DIR + '/' + PART_NAME + '.txt', 'w') Partition_Doc_SymLinks = open(SymLinkDoc, 'w') + Partition_Doc_HardLinks = open(HardLinkDoc, "w") print("writing diff'ed changed files...") for elt in files_changed: @@ -600,9 +576,18 @@ def SS_Generate_Delta(PART_NAME, BASE_OLD, Old_files, Old_dirs, BASE_NEW, New_fi patch = os.readlink(dst_file) Sym_Diff_Cnt = Sym_Diff_Cnt + 1 Partition_Doc_SymLinks.write('SYM:DIFF:%s:%s:%s\n' % (elt, elt, patch)) - Update_Attr(elt, "SYM", File_Attibutes, Sym_Attibutes) + Update_Attr(elt, "SYM", File_Attributes, Sym_Attributes) + # Both are hardlinks and they differ (point to something different, new/changed file) + if elt in Old_hardlinks and elt in New_hardlinks: + if Old_hardlinks[elt] != New_hardlinks[elt] or New_hardlinks[elt] in files_changed or New_hardlinks[elt] in files_new: + logging.debug('Hardlinks changed %s %s' % (src_file, dst_file)) + patch = New_hardlinks[elt] + Hard_Diff_Cnt += 1 + Partition_Doc_HardLinks.write('HARD:DIFF:%s:%s:%s\n' % (elt, elt, patch)) # Both are NORMAL files and they differ - elif (not (os.path.islink(src_file) or os.path.islink(dst_file))) and os.path.isfile(dst_file) and os.path.isfile(src_file): + elif (not (os.path.islink(src_file) or os.path.islink(dst_file))) \ + and (not (elt in Old_hardlinks or elt in New_hardlinks)) \ + and os.path.isfile(dst_file) and os.path.isfile(src_file): # Both are files and they differ Diff_Cnt = Diff_Cnt + 1 patchName = (DIFF_PREFIX + '%d_%s_' + PART_NAME + DIFF_SUFFIX) % (Diff_Cnt, path_leaf(elt)) @@ -619,17 +604,16 @@ def SS_Generate_Delta(PART_NAME, BASE_OLD, Old_files, Old_dirs, BASE_NEW, New_fi else: Partition_Doc.write('DIFF:REG:%s:%s:%s:%s:%s\n' % (elt, elt, hash_file(src_file), hash_file(dst_file), patchName)) - Update_Attr(elt, "FILE", File_Attibutes, Sym_Attibutes) + Update_Attr(elt, "FILE", File_Attributes, Sym_Attributes) # Both differ but they are of diff types else: # Processing and updating partition txt file will be done under REMOVED case and NEW files case accordingly, we just make an entry here files_removed.append(elt) files_new.append(elt) - fdupes = find_dupes_list(BASE_OLD, BASE_NEW, files_removed, files_new) + fdupes = find_dupes_list(BASE_OLD, BASE_NEW, files_removed, files_new, Old_hardlinks, New_hardlinks) for oldpath, newpath in fdupes.iteritems(): logging.info('Dupes %s -> %s' % (oldpath, newpath)) - for elt in files_removed: src_file = BASE_OLD + '/' + elt # If parent directory is deleted.. & del end not possible. (==> Moves should be done before deletes in ENGINE) @@ -640,7 +624,6 @@ def SS_Generate_Delta(PART_NAME, BASE_OLD, Old_files, Old_dirs, BASE_NEW, New_fi Partition_Doc.write('MOVE:REG:%s:%s:%s\n' % (elt, fdupes[src_file], hash_file(src_file))) files_removed.remove(elt) files_new.remove(fdupes[src_file]) - # Should be placed after removing duplicates, else they will be filtered here. # loop shd b for all NEW files, rather than for all delete files (Current understanding) # First Step: Sort & Filter out unwanted files @@ -650,7 +633,6 @@ def SS_Generate_Delta(PART_NAME, BASE_OLD, Old_files, Old_dirs, BASE_NEW, New_fi # 3. File name length shd b greater than 3 char # 4. As we are using sorting on file names, once file name does not match and R_Flag is set to true, we nee not check remaining files. So, will execute break. # 5. Should consider editdistance for RENAME LOGIC ==> TBD - Base_DelList = files_removed[:] Base_NewList = files_new[:] DelList = sorted(Base_DelList, key=path_leaf) @@ -663,6 +645,8 @@ def SS_Generate_Delta(PART_NAME, BASE_OLD, Old_files, Old_dirs, BASE_NEW, New_fi for file in DelList: if os.path.islink(BASE_OLD + '/' + file): continue + elif ishardlink(BASE_OLD + '/' + file): + continue elif os.path.isdir(BASE_OLD + '/' + file): continue else: @@ -674,6 +658,8 @@ def SS_Generate_Delta(PART_NAME, BASE_OLD, Old_files, Old_dirs, BASE_NEW, New_fi for file in NewList: if os.path.islink(BASE_NEW + '/' + file): continue + elif ishardlink(BASE_NEW + '/' + file): + continue elif os.path.isdir(BASE_NEW + '/' + file): continue elif len(path_leaf(file)) <= 3: @@ -681,9 +667,7 @@ def SS_Generate_Delta(PART_NAME, BASE_OLD, Old_files, Old_dirs, BASE_NEW, New_fi continue else: Filter2.append(file) - NewList = Filter2 - logging.debug('Rename Logic After filter: Delcount -%d NewCount -%d' % (len(DelList), len(NewList))) for new_file in NewList: @@ -713,7 +697,7 @@ def SS_Generate_Delta(PART_NAME, BASE_OLD, Old_files, Old_dirs, BASE_NEW, New_fi winning_patch_sz = DiffSize winning_file = del_file elif (not FileNameOld.startswith(FileNameNew[:len(FileNameNew) * 7 / 10]) and R_Flag == 'TRUE'): - logging.debug('Becuase nex set of files will not have matching name - break @@ %s %s' % (del_file, new_file)) + logging.debug('Because nex set of files will not have matching name - break @@ %s %s' % (del_file, new_file)) break if len(winning_file) > 0: logging.debug('Best Pick -%s ==> %s [%d]' % (winning_file, new_file, DiffSize)) @@ -740,8 +724,11 @@ def SS_Generate_Delta(PART_NAME, BASE_OLD, Old_files, Old_dirs, BASE_NEW, New_fi if os.path.isdir(src_file) or os.path.isdir(dst_file): # This case never occurs?? Partition_Doc.write('"%s" and "%s" renamed 0 0\n' % (elt[0], elt[1])) - Update_Attr(elt[0], "FILE", File_Attibutes, Sym_Attibutes) - else: # Make sure these files are PROPER and they shd NOT be symlinks + Update_Attr(elt[0], "FILE", File_Attributes, Sym_Attributes) + # Make sure these files are PROPER and they shd NOT be symlinks + elif not (os.path.islink(src_file) or os.path.islink(dst_file)) \ + and not (elt[0] in New_hardlinks or elt[1] in Old_hardlinks) \ + and (os.path.isfile(src_file) and os.path.isfile(dst_file)): if filecmp.cmp(src_file, dst_file): Move_Cnt = Move_Cnt + 1 Diff_Cnt = Diff_Cnt - 1 @@ -757,7 +744,7 @@ def SS_Generate_Delta(PART_NAME, BASE_OLD, Old_files, Old_dirs, BASE_NEW, New_fi Partition_Doc.write('DIFF:REG:%s:%s:%s:%s:%s\n' % (elt[1], elt[0], hash_file(src_file), hash_file(dst_file), patchName)) SS_UpdateSize(src_file, dst_file) - Update_Attr(elt[0], "FILE", File_Attibutes, Sym_Attibutes) + Update_Attr(elt[0], "FILE", File_Attributes, Sym_Attributes) # HANDLING VERBATIM - We Process NEWs and DELETEs for Verbatim list ONLY after processing duplicates & rename functionality. # So that, the rename functionality will NOT create PATCH instead of verbatims. @@ -777,6 +764,8 @@ def SS_Generate_Delta(PART_NAME, BASE_OLD, Old_files, Old_dirs, BASE_NEW, New_fi src_file = BASE_OLD + '/' + elt if os.path.islink(src_file): Partition_Doc.write('DEL:SYM:%s\n' % (elt)) + elif elt in Old_hardlinks: + Partition_Doc.write('DEL:HARD:%s\n' % (elt)) elif os.path.isdir(src_file): # If we change to DIR TYPE, then the same token should be modified on UA also and SHA should be accordingly passed. Partition_Doc.write('DEL:REG:%s:NA\n' % (elt)) @@ -813,8 +802,17 @@ def SS_Generate_Delta(PART_NAME, BASE_OLD, Old_files, Old_dirs, BASE_NEW, New_fi if not os.path.exists(path_head(destpath)): os.makedirs(path_head(destpath)) logging.info('New SymLink - Adding missing Dir') - #Update_Attr(elt, "SYM", File_Attibutes, Sym_Attibutes) + Update_Attr(elt, "SYM", File_Attributes, Sym_Attributes) Sym_New_Cnt = Sym_New_Cnt + 1 + elif elt in New_hardlinks: + patch = New_hardlinks[elt] + logging.debug('File new hardlink %s' % elt) + Partition_Doc_HardLinks.write('HARD:NEW:%s:%s\n' %(elt, patch)) + destpath = newfiles_dest_path + elt + if not os.path.exists(path_head(destpath)): + os.makedirs(path_head(destpath)) + logging.info('New hardlink - Adding missing Dir') + Hard_New_Cnt += 1 elif os.path.isdir(dst_file): # We create just empty directory here destpath = newfiles_dest_path + elt if not os.path.exists(destpath): @@ -841,6 +839,7 @@ def SS_Generate_Delta(PART_NAME, BASE_OLD, Old_files, Old_dirs, BASE_NEW, New_fi except Exception as exc: logging.critical('Error in NEW files entry -%s -%s' % (dst_file, destpath)) raise exc + Update_Attr(elt, "FILE", File_Attributes, Sym_Attributes) for elt in Dir_Added: newfiles_dest_path = 'run/upgrade-sysroot/' @@ -873,29 +872,39 @@ def SS_Generate_Delta(PART_NAME, BASE_OLD, Old_files, Old_dirs, BASE_NEW, New_fi logging.info('%d files unchanged' % len(files_unchanged)) logging.info('%d files files_renamed' % len(files_renamed)) logging.info('%d files NEW' % len(files_new)) - logging.info('%d File attr' % len(File_Attibutes)) - logging.info('%d Sym attr' % len(Sym_Attibutes)) - logging.info('PaTcHCoUnT:Diffs-%d Moves-%d News-%d Delets-%d SymDiffs-%d SymNews-%d Verbatim -%d\n' % (Diff_Cnt, Move_Cnt, New_Cnt, Del_Cnt, Sym_Diff_Cnt, Sym_New_Cnt, Verbatim_Cnt)) - print('PaTcHCoUnT:Diffs-%d Moves-%d News-%d Delets-%d SymDiffs-%d SymNews-%d Verbatim -%d\n' % (Diff_Cnt, Move_Cnt, New_Cnt, Del_Cnt, Sym_Diff_Cnt, Sym_New_Cnt, Verbatim_Cnt)) + logging.info('%d File attr' % len(File_Attributes)) + logging.info('%d Sym attr' % len(Sym_Attributes)) + logging.info('PaTcHCoUnT:Diffs-%d Moves-%d News-%d Delets-%d SymDiffs-%d SymNews-%d HardDiffs-%d HardNews-%d Verbatim -%d\n' % \ + (Diff_Cnt, Move_Cnt, New_Cnt, Del_Cnt, Sym_Diff_Cnt, Sym_New_Cnt, Hard_Diff_Cnt, Hard_New_Cnt, Verbatim_Cnt)) + print('PaTcHCoUnT:Diffs-%d Moves-%d News-%d Delets-%d SymDiffs-%d SymNews-%d HardDiffs-%d HardNews-%d Verbatim -%d\n' % \ + (Diff_Cnt, Move_Cnt, New_Cnt, Del_Cnt, Sym_Diff_Cnt, Sym_New_Cnt, Hard_Diff_Cnt, Hard_New_Cnt, Verbatim_Cnt)) # There could be duplicates, TODO, can check before adding.. ATTR_FILE_D = open(ATTR_FILE, 'a+') - for elt in File_Attibutes: + for elt in File_Attributes: ATTR_FILE_D.write(elt) - for elt in Sym_Attibutes: + for elt in Sym_Attributes: ATTR_FILE_D.write(elt) ATTR_FILE_D.close() Partition_Doc_SymLinks.close() + Partition_Doc_HardLinks.close() Partition_Read_SymLinks = open(SymLinkDoc, 'r+') + Partition_Read_HardLinks = open(HardLinkDoc, 'r+') Partition_Doc.write(Partition_Read_SymLinks.read()) - Partition_Doc.write('PaTcHCoUnT:%d %d %d %d %d %d\n' % (Diff_Cnt, Move_Cnt, New_Cnt, Del_Cnt, Sym_Diff_Cnt, Sym_New_Cnt)) - Partition_Doc_SymLinks.close() + for line in reversed(Partition_Read_HardLinks.readlines()): + Partition_Doc.write(line) + Partition_Doc.write('PaTcHCoUnT:%d %d %d %d %d %d %d %d\n' % \ + (Diff_Cnt, Move_Cnt, New_Cnt, Del_Cnt, Sym_Diff_Cnt, Sym_New_Cnt, Hard_Diff_Cnt, Hard_New_Cnt)) + Partition_Read_SymLinks.close() + Partition_Read_HardLinks.close() Partition_Doc.close() os.remove(SymLinkDoc) + os.remove(HardLinkDoc) - if Diff_Cnt + Move_Cnt + New_Cnt + Del_Cnt + Sym_Diff_Cnt + Sym_New_Cnt + Verbatim_Cnt + os.path.getsize(ATTR_FILE) == 0: + if Diff_Cnt + Move_Cnt + New_Cnt + Del_Cnt + Sym_Diff_Cnt + Sym_New_Cnt + Verbatim_Cnt + Hard_Diff_Cnt + \ + Hard_New_Cnt + os.path.getsize(ATTR_FILE) == 0: print('No Delta Generated for %s - %s' % (PART_NAME, OUT_DIR)) logging.info('No Delta Generated for %s' % PART_NAME) shutil.rmtree(OUT_DIR) @@ -920,12 +929,41 @@ def NewFiles(src, dest): def measure_two_filediffs(src, dst): patchLoc = 'temp.patch' + # TODO ensure this is excepts an error subprocess.call([DIFF_UTIL, src, dst, patchLoc]) result_size = os.path.getsize(patchLoc) os.remove(patchLoc) return result_size +def ishardlink(path): + if os.stat(path).st_nlink > 1: + return True + return False + + +def get_inode(path): + return os.stat(path).st_ino + + +def get_hardlinks(base): + hardlinks_dict = {} + inodes_dict = {} + + for root, direcotories, files in os.walk(base, topdown=True, followlinks=False): + for file in sorted(files): + file_name = os.path.join(root, file) + if not os.path.islink(file_name) and ishardlink(file_name): + inode = get_inode(file_name) + rel_path = os.path.relpath(file_name, base) + if inode not in inodes_dict: + inodes_dict[inode] = rel_path + else: + hardlinks_dict[rel_path] = inodes_dict[inode] + + return hardlinks_dict + + def Get_Files(path): all_files = [] all_dirs = [] -- 2.7.4