SYMLINK_TYPE = "SYM"
ATTR_DOC_EXT = "_attr.txt"
SYMLINK_DOC_NAME = "_sym.txt"
+HARDLINK_DOC_NAME = "_hard.txt"
PART_DOC_EXT = ".txt"
DIFF_PREFIX = "diff"
DIFF_SUFFIX = ".delta"
file_out.write(line + '\n')
-def Update_Attr(RequestedPath, Type, File_Attibutes, Sym_Attibutes):
+def Update_Attr(RequestedPath, Type, File_Attributes, Sym_Attributes):
# Full File Path should MATCH
if GenerateDiffAttr == "FALSE":
return
for line in f:
if FilePath in line:
if Type == SYMLINK_TYPE:
- Sym_Attibutes.append(line)
+ Sym_Attributes.append(line)
else:
- File_Attibutes.append(line)
+ File_Attributes.append(line)
def hash_file(filename):
return h.hexdigest()
-def find_dupes_dir(BASE_OLD, BASE_NEW):
- dups = {}
- fdupes = {}
- print('Finding Duplicates in - %s %s' % (BASE_OLD, BASE_NEW))
- logging.info('Finding Duplicates in - %s %s' % (BASE_OLD, BASE_NEW))
- for rootbase, subdirsB, fileListB in os.walk(BASE_OLD):
- #print('Scanning %s...' % rootbase)
- for filename in fileListB:
- path = os.path.join(rootbase, filename)
- if os.path.islink(path):
- continue
- # Calculate hash
- file_hash = hash_file(path)
- dups[file_hash] = path
-
- for roottarget, subdirsT, fileListT in os.walk(BASE_NEW):
- #print('Scanning %s...' % roottarget)
- for filename in fileListT:
- # Get the path to the file
- path = os.path.join(roottarget, filename)
- if os.path.islink(path):
- continue
- # Calculate hash
- file_hash = hash_file(path)
- # Add or append the file path
- if file_hash in dups:
- BaseStr = dups.get(file_hash)
- Baseloc = path.find('/')
- TarLoc = BaseStr.find('/')
- if not path[Baseloc:] == BaseStr[TarLoc:]:
- logging.info('Dupes - %s ==> %s' % (path[Baseloc:], BaseStr[TarLoc:]))
- fdupes[path] = BaseStr
- logging.info('Total Duplicate files %d' % (len(fdupes)))
- return fdupes
-
-
-def find_dupes_list(BASE_OLD, BASE_NEW, fileListB, fileListT):
+def find_dupes_list(BASE_OLD, BASE_NEW, fileListB, fileListT, Old_hardlinks, New_hardlinks):
dups = {}
fdupes = {}
print('Finding Duplicates in - %s %s' % (BASE_OLD, BASE_NEW))
for filename in fileListB:
Src_File = BASE_OLD + '/' + filename
- if os.path.islink(Src_File) or os.path.isdir(Src_File):
+ if os.path.islink(Src_File) or os.path.isdir(Src_File) or ishardlink(Src_File):
continue
# Calculate hash
file_hash = hash_file(Src_File)
for filename in fileListT:
Dest_File = BASE_NEW + '/' + filename
- if os.path.islink(Dest_File) or os.path.isdir(Dest_File):
+ if os.path.islink(Dest_File) or os.path.isdir(Dest_File) or ishardlink(Dest_File):
continue
# Calculate hash
file_hash = hash_file(Dest_File)
if not BaseStr[Baseloc:] == filename:
#print('Dupes - %s ==> %s' % (BaseStr[Baseloc:], filename))
fdupes[BaseStr] = filename
-
logging.info('Total Duplicate files %d' % (len(fdupes)))
return fdupes
files_changed = []
files_unchanged = []
files_renamed = []
- File_Attibutes = []
- Sym_Attibutes = []
+ File_Attributes = []
+ Sym_Attributes = []
files_Del_List = {}
files_New_List = {}
- MyDict_Patches = {}
- PWD = os.getcwd()
+ # Get dictionaries used for hardlinks from both directories
+ New_hardlinks = get_hardlinks(BASE_NEW)
+ Old_hardlinks = get_hardlinks(BASE_OLD)
# Generate NEW List
for elt in New_files:
dst_file = BASE_NEW + '/' + elt
#print('Files Changed - %s -%s' % (src_file,dst_file))
if os.path.islink(src_file) and os.path.islink(dst_file):
- if not os.readlink(src_file) == os.readlink(dst_file):
+ if not (os.readlink(src_file) == os.readlink(dst_file)):
files_changed.append(elt)
#print('%d Sym link files changed' % len(files_changed))
logging.info('Sym links Changed - %s' % elt)
else:
files_unchanged.append(elt)
- # Both are Normal files and they differ. (Is file returns true in case of symlink also, so additional check to find either of the file is symlink)
- elif (not (os.path.islink(src_file) or os.path.islink(dst_file))) and os.path.isfile(src_file) and os.path.isfile(dst_file):
+ # Both are hardlinks - we add them because we can't be sure if file they point to changes
+ elif elt in New_hardlinks and elt in Old_hardlinks:
+ files_changed.append(elt)
+ # Both are Normal files and they differ. (Is file returns true in case of sym/hardlink also,
+ # so additional check to find either of the file is sym/hardlink)
+ elif (not (os.path.islink(src_file) or os.path.islink(dst_file))) \
+ and (not (elt in New_hardlinks or elt in Old_hardlinks)) \
+ and os.path.isfile(src_file) and os.path.isfile(dst_file):
if not filecmp.cmp(src_file, dst_file):
files_changed.append(elt)
#print('%d Normal files changed' % len(files_changed))
# this file is the same name in both!
src_file = BASE_OLD + '/' + value
dst_file = BASE_NEW + '/' + files_New_List[key]
- olddirpath = path_head(files_New_List[key])
- newdirpath = path_head(value)
- if os.path.islink(src_file) or os.path.islink(dst_file):
+ # we don't want to move hardlinks
+ if ishardlink(src_file) or ishardlink(dst_file):
+ logging.debug('Cannot diff as one of them is a hardlink')
+ elif os.path.islink(src_file) or os.path.islink(dst_file):
logging.debug('Cannot diff as one of them is Symlink')
elif os.path.isdir(src_file) or os.path.isdir(dst_file):
logging.debug('Cannot diff as one of them is dir')
'''
Sym_Diff_Cnt = 0
Sym_New_Cnt = 0
+ Hard_Diff_Cnt = 0
+ Hard_New_Cnt = 0
Del_Cnt = 0
New_Cnt = 0
Diff_Cnt = 0
Move_Cnt = 0
Verbatim_Cnt = 0
SymLinkDoc = OUT_DIR + '/' + PART_NAME + SYMLINK_DOC_NAME
+ HardLinkDoc = OUT_DIR + '/' + PART_NAME + HARDLINK_DOC_NAME
Partition_Doc = open(OUT_DIR + '/' + PART_NAME + '.txt', 'w')
Partition_Doc_SymLinks = open(SymLinkDoc, 'w')
+ Partition_Doc_HardLinks = open(HardLinkDoc, "w")
print("writing diff'ed changed files...")
for elt in files_changed:
patch = os.readlink(dst_file)
Sym_Diff_Cnt = Sym_Diff_Cnt + 1
Partition_Doc_SymLinks.write('SYM:DIFF:%s:%s:%s\n' % (elt, elt, patch))
- Update_Attr(elt, "SYM", File_Attibutes, Sym_Attibutes)
+ Update_Attr(elt, "SYM", File_Attributes, Sym_Attributes)
+ # Both are hardlinks and they differ (point to something different, new/changed file)
+ if elt in Old_hardlinks and elt in New_hardlinks:
+ if Old_hardlinks[elt] != New_hardlinks[elt] or New_hardlinks[elt] in files_changed or New_hardlinks[elt] in files_new:
+ logging.debug('Hardlinks changed %s %s' % (src_file, dst_file))
+ patch = New_hardlinks[elt]
+ Hard_Diff_Cnt += 1
+ Partition_Doc_HardLinks.write('HARD:DIFF:%s:%s:%s\n' % (elt, elt, patch))
# Both are NORMAL files and they differ
- elif (not (os.path.islink(src_file) or os.path.islink(dst_file))) and os.path.isfile(dst_file) and os.path.isfile(src_file):
+ elif (not (os.path.islink(src_file) or os.path.islink(dst_file))) \
+ and (not (elt in Old_hardlinks or elt in New_hardlinks)) \
+ and os.path.isfile(dst_file) and os.path.isfile(src_file):
# Both are files and they differ
Diff_Cnt = Diff_Cnt + 1
patchName = (DIFF_PREFIX + '%d_%s_' + PART_NAME + DIFF_SUFFIX) % (Diff_Cnt, path_leaf(elt))
else:
Partition_Doc.write('DIFF:REG:%s:%s:%s:%s:%s\n' % (elt, elt, hash_file(src_file), hash_file(dst_file), patchName))
- Update_Attr(elt, "FILE", File_Attibutes, Sym_Attibutes)
+ Update_Attr(elt, "FILE", File_Attributes, Sym_Attributes)
# Both differ but they are of diff types
else:
# Processing and updating partition txt file will be done under REMOVED case and NEW files case accordingly, we just make an entry here
files_removed.append(elt)
files_new.append(elt)
- fdupes = find_dupes_list(BASE_OLD, BASE_NEW, files_removed, files_new)
+ fdupes = find_dupes_list(BASE_OLD, BASE_NEW, files_removed, files_new, Old_hardlinks, New_hardlinks)
for oldpath, newpath in fdupes.iteritems():
logging.info('Dupes %s -> %s' % (oldpath, newpath))
-
for elt in files_removed:
src_file = BASE_OLD + '/' + elt
# If parent directory is deleted.. & del end not possible. (==> Moves should be done before deletes in ENGINE)
Partition_Doc.write('MOVE:REG:%s:%s:%s\n' % (elt, fdupes[src_file], hash_file(src_file)))
files_removed.remove(elt)
files_new.remove(fdupes[src_file])
-
# Should be placed after removing duplicates, else they will be filtered here.
# loop shd b for all NEW files, rather than for all delete files (Current understanding)
# First Step: Sort & Filter out unwanted files
# 3. File name length shd b greater than 3 char
# 4. As we are using sorting on file names, once file name does not match and R_Flag is set to true, we need not check remaining files. So, will execute break.
# 5. Should consider editdistance for RENAME LOGIC ==> TBD
-
Base_DelList = files_removed[:]
Base_NewList = files_new[:]
DelList = sorted(Base_DelList, key=path_leaf)
for file in DelList:
if os.path.islink(BASE_OLD + '/' + file):
continue
+ elif ishardlink(BASE_OLD + '/' + file):
+ continue
elif os.path.isdir(BASE_OLD + '/' + file):
continue
else:
for file in NewList:
if os.path.islink(BASE_NEW + '/' + file):
continue
+ elif ishardlink(BASE_NEW + '/' + file):
+ continue
elif os.path.isdir(BASE_NEW + '/' + file):
continue
elif len(path_leaf(file)) <= 3:
continue
else:
Filter2.append(file)
-
NewList = Filter2
-
logging.debug('Rename Logic After filter: Delcount -%d NewCount -%d' % (len(DelList), len(NewList)))
for new_file in NewList:
winning_patch_sz = DiffSize
winning_file = del_file
elif (not FileNameOld.startswith(FileNameNew[:len(FileNameNew) * 7 / 10]) and R_Flag == 'TRUE'):
- logging.debug('Becuase nex set of files will not have matching name - break @@ %s %s' % (del_file, new_file))
+ logging.debug('Because nex set of files will not have matching name - break @@ %s %s' % (del_file, new_file))
break
if len(winning_file) > 0:
logging.debug('Best Pick -%s ==> %s [%d]' % (winning_file, new_file, DiffSize))
if os.path.isdir(src_file) or os.path.isdir(dst_file):
# This case never occurs??
Partition_Doc.write('"%s" and "%s" renamed 0 0\n' % (elt[0], elt[1]))
- Update_Attr(elt[0], "FILE", File_Attibutes, Sym_Attibutes)
- else: # Make sure these files are PROPER and they shd NOT be symlinks
+ Update_Attr(elt[0], "FILE", File_Attributes, Sym_Attributes)
+ # Make sure these files are PROPER and they shd NOT be symlinks
+ elif not (os.path.islink(src_file) or os.path.islink(dst_file)) \
+ and not (elt[0] in New_hardlinks or elt[1] in Old_hardlinks) \
+ and (os.path.isfile(src_file) and os.path.isfile(dst_file)):
if filecmp.cmp(src_file, dst_file):
Move_Cnt = Move_Cnt + 1
Diff_Cnt = Diff_Cnt - 1
Partition_Doc.write('DIFF:REG:%s:%s:%s:%s:%s\n' % (elt[1], elt[0], hash_file(src_file), hash_file(dst_file), patchName))
SS_UpdateSize(src_file, dst_file)
- Update_Attr(elt[0], "FILE", File_Attibutes, Sym_Attibutes)
+ Update_Attr(elt[0], "FILE", File_Attributes, Sym_Attributes)
# HANDLING VERBATIM - We Process NEWs and DELETEs for Verbatim list ONLY after processing duplicates & rename functionality.
# So that, the rename functionality will NOT create PATCH instead of verbatims.
src_file = BASE_OLD + '/' + elt
if os.path.islink(src_file):
Partition_Doc.write('DEL:SYM:%s\n' % (elt))
+ elif elt in Old_hardlinks:
+ Partition_Doc.write('DEL:HARD:%s\n' % (elt))
elif os.path.isdir(src_file):
# If we change to DIR TYPE, then the same token should be modified on UA also and SHA should be accordingly passed.
Partition_Doc.write('DEL:REG:%s:NA\n' % (elt))
if not os.path.exists(path_head(destpath)):
os.makedirs(path_head(destpath))
logging.info('New SymLink - Adding missing Dir')
- #Update_Attr(elt, "SYM", File_Attibutes, Sym_Attibutes)
+ Update_Attr(elt, "SYM", File_Attributes, Sym_Attributes)
Sym_New_Cnt = Sym_New_Cnt + 1
+ elif elt in New_hardlinks:
+ patch = New_hardlinks[elt]
+ logging.debug('File new hardlink %s' % elt)
+ Partition_Doc_HardLinks.write('HARD:NEW:%s:%s\n' %(elt, patch))
+ destpath = newfiles_dest_path + elt
+ if not os.path.exists(path_head(destpath)):
+ os.makedirs(path_head(destpath))
+ logging.info('New hardlink - Adding missing Dir')
+ Hard_New_Cnt += 1
elif os.path.isdir(dst_file): # We create just empty directory here
destpath = newfiles_dest_path + elt
if not os.path.exists(destpath):
except Exception as exc:
logging.critical('Error in NEW files entry -%s -%s' % (dst_file, destpath))
raise exc
+ Update_Attr(elt, "FILE", File_Attributes, Sym_Attributes)
for elt in Dir_Added:
newfiles_dest_path = 'run/upgrade-sysroot/'
logging.info('%d files unchanged' % len(files_unchanged))
logging.info('%d files files_renamed' % len(files_renamed))
logging.info('%d files NEW' % len(files_new))
- logging.info('%d File attr' % len(File_Attibutes))
- logging.info('%d Sym attr' % len(Sym_Attibutes))
- logging.info('PaTcHCoUnT:Diffs-%d Moves-%d News-%d Delets-%d SymDiffs-%d SymNews-%d Verbatim -%d\n' % (Diff_Cnt, Move_Cnt, New_Cnt, Del_Cnt, Sym_Diff_Cnt, Sym_New_Cnt, Verbatim_Cnt))
- print('PaTcHCoUnT:Diffs-%d Moves-%d News-%d Delets-%d SymDiffs-%d SymNews-%d Verbatim -%d\n' % (Diff_Cnt, Move_Cnt, New_Cnt, Del_Cnt, Sym_Diff_Cnt, Sym_New_Cnt, Verbatim_Cnt))
+ logging.info('%d File attr' % len(File_Attributes))
+ logging.info('%d Sym attr' % len(Sym_Attributes))
+ logging.info('PaTcHCoUnT:Diffs-%d Moves-%d News-%d Delets-%d SymDiffs-%d SymNews-%d HardDiffs-%d HardNews-%d Verbatim -%d\n' % \
+ (Diff_Cnt, Move_Cnt, New_Cnt, Del_Cnt, Sym_Diff_Cnt, Sym_New_Cnt, Hard_Diff_Cnt, Hard_New_Cnt, Verbatim_Cnt))
+ print('PaTcHCoUnT:Diffs-%d Moves-%d News-%d Delets-%d SymDiffs-%d SymNews-%d HardDiffs-%d HardNews-%d Verbatim -%d\n' % \
+ (Diff_Cnt, Move_Cnt, New_Cnt, Del_Cnt, Sym_Diff_Cnt, Sym_New_Cnt, Hard_Diff_Cnt, Hard_New_Cnt, Verbatim_Cnt))
# There could be duplicates, TODO, can check before adding..
ATTR_FILE_D = open(ATTR_FILE, 'a+')
- for elt in File_Attibutes:
+ for elt in File_Attributes:
ATTR_FILE_D.write(elt)
- for elt in Sym_Attibutes:
+ for elt in Sym_Attributes:
ATTR_FILE_D.write(elt)
ATTR_FILE_D.close()
Partition_Doc_SymLinks.close()
+ Partition_Doc_HardLinks.close()
Partition_Read_SymLinks = open(SymLinkDoc, 'r+')
+ Partition_Read_HardLinks = open(HardLinkDoc, 'r+')
Partition_Doc.write(Partition_Read_SymLinks.read())
- Partition_Doc.write('PaTcHCoUnT:%d %d %d %d %d %d\n' % (Diff_Cnt, Move_Cnt, New_Cnt, Del_Cnt, Sym_Diff_Cnt, Sym_New_Cnt))
- Partition_Doc_SymLinks.close()
+ for line in reversed(Partition_Read_HardLinks.readlines()):
+ Partition_Doc.write(line)
+ Partition_Doc.write('PaTcHCoUnT:%d %d %d %d %d %d %d %d\n' % \
+ (Diff_Cnt, Move_Cnt, New_Cnt, Del_Cnt, Sym_Diff_Cnt, Sym_New_Cnt, Hard_Diff_Cnt, Hard_New_Cnt))
+ Partition_Read_SymLinks.close()
+ Partition_Read_HardLinks.close()
Partition_Doc.close()
os.remove(SymLinkDoc)
+ os.remove(HardLinkDoc)
- if Diff_Cnt + Move_Cnt + New_Cnt + Del_Cnt + Sym_Diff_Cnt + Sym_New_Cnt + Verbatim_Cnt + os.path.getsize(ATTR_FILE) == 0:
+ if Diff_Cnt + Move_Cnt + New_Cnt + Del_Cnt + Sym_Diff_Cnt + Sym_New_Cnt + Verbatim_Cnt + Hard_Diff_Cnt + \
+ Hard_New_Cnt + os.path.getsize(ATTR_FILE) == 0:
print('No Delta Generated for %s - %s' % (PART_NAME, OUT_DIR))
logging.info('No Delta Generated for %s' % PART_NAME)
shutil.rmtree(OUT_DIR)
def measure_two_filediffs(src, dst):
    """Return the size in bytes of the patch DIFF_UTIL writes for src -> dst.

    DIFF_UTIL is invoked as ``DIFF_UTIL src dst temp.patch``; the patch is a
    scratch file named ``temp.patch`` in the current working directory and is
    removed again before this function returns.

    Raises OSError if no patch file exists once the tool has run.
    """
    patchLoc = 'temp.patch'
    # TODO: decide whether a failing diff tool should abort the run. For now a
    # non-zero exit status is only logged, so existing callers keep working.
    status = subprocess.call([DIFF_UTIL, src, dst, patchLoc])
    if status != 0:
        logging.warning('%s exited with status %d for %s -> %s',
                        DIFF_UTIL, status, src, dst)
    try:
        return os.path.getsize(patchLoc)
    finally:
        # Never leave the scratch patch behind, even when measuring it failed.
        if os.path.exists(patchLoc):
            os.remove(patchLoc)
def ishardlink(path):
    """Return True when *path* is one of several hard links to the same inode.

    The path itself is examined with ``os.lstat``, i.e. symlinks are not
    followed, so a dangling symlink yields False instead of raising.
    Symlinks and directories always yield False: a directory's link count is
    above one on most file systems (because of its ``.``/``..`` entries),
    which does not make it a hard link.

    Raises OSError if *path* does not exist.
    """
    import stat  # local import: only this helper needs the mode predicates

    info = os.lstat(path)
    if stat.S_ISLNK(info.st_mode) or stat.S_ISDIR(info.st_mode):
        return False
    return info.st_nlink > 1
+
+
def get_inode(path):
    """Return the inode number of *path* itself.

    ``os.lstat`` is used so that a symlink reports its own inode rather than
    its target's, and a dangling symlink does not raise.  For anything that
    is not a symlink the result is the same as with ``os.stat``.

    Raises OSError if *path* does not exist.
    """
    return os.lstat(path).st_ino
+
+
def get_hardlinks(base):
    """Map every secondary hard link under *base* to its primary path.

    The tree is walked top-down without following symlinks.  The first path
    seen for a given inode is treated as the primary; each later path that
    shares that inode is recorded as ``{later_rel_path: primary_rel_path}``.
    All paths are relative to *base*.  Primaries themselves, symlinks, and
    entries with a link count of one do not appear in the result.

    Directory names and file names are both visited in sorted order so the
    choice of primary is reproducible and comparable between two trees,
    rather than depending on the file system's listing order.
    """
    import stat  # local import: only needed for the symlink mode test

    hardlinks_dict = {}
    primary_by_inode = {}

    for root, directories, files in os.walk(base, topdown=True, followlinks=False):
        # Sorting in place steers the order in which os.walk descends
        # (honoured in top-down mode only).
        directories.sort()
        for name in sorted(files):
            full_path = os.path.join(root, name)
            info = os.lstat(full_path)
            if stat.S_ISLNK(info.st_mode) or info.st_nlink <= 1:
                continue
            # Inode numbers are only unique per device, so key on both.
            inode_key = (info.st_dev, info.st_ino)
            rel_path = os.path.relpath(full_path, base)
            if inode_key in primary_by_inode:
                hardlinks_dict[rel_path] = primary_by_inode[inode_key]
            else:
                primary_by_inode[inode_key] = rel_path

    return hardlinks_dict
+
+
def Get_Files(path):
all_files = []
all_dirs = []