From 5f9dedf37b411a1ed4680e6ad50e00d98b8d2dcf Mon Sep 17 00:00:00 2001
From: Paul Nasrat
Date: Thu, 7 Jun 2007 08:17:46 +0000
Subject: [PATCH] This patch adds a --update option to createrepo.

https://lists.dulug.duke.edu/pipermail/rpm-metadata/2007-March/000756.html

Patch from Mike Bonnet

Review fixes folded into this copy of the patch:
 - writeMetadataDocs() skips a package that fails to scan instead of
   returning early (and returning None) for the whole file list
 - MetadataIndex compares whole-second mtimes, so unchanged rpms are
   still recognised when os.stat() reports float timestamps
 - MetadataIndex.pkgrefs always exists, even when old metadata is missing
 - the "Indexed %i ... nodes" messages are formatted after translation
 - the readMetadata.py self-test passes the 'pkgdir' option it needs
 - the man page entry escapes both dashes of --update
---
 Makefile          |   1 +
 docs/createrepo.8 |   7 ++
 genpkgmetadata.py | 134 +++++++++++++++++++-----------
 readMetadata.py   | 217 ++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 311 insertions(+), 48 deletions(-)
 create mode 100644 readMetadata.py

diff --git a/Makefile b/Makefile
index bd0d973..b83bef5 100644
--- a/Makefile
+++ b/Makefile
@@ -41,6 +41,7 @@ SUBDIRS = bin docs
 
 MODULES = $(srcdir)/genpkgmetadata.py \
 	$(srcdir)/dumpMetadata.py \
+	$(srcdir)/readMetadata.py \
 	$(srcdir)/modifyrepo.py
 
 .SUFFIXES: .py .pyc
diff --git a/docs/createrepo.8 b/docs/createrepo.8
index 8bab10e..358ee85 100644
--- a/docs/createrepo.8
+++ b/docs/createrepo.8
@@ -32,6 +32,13 @@ cache of checksums of packages in the repository. In consecutive runs of
 createrepo over the same repository of files that do not have a complete
 change out of all packages this decreases the processing time dramatically.
 .br
+.IP "\fB\-\-update\fP"
+If metadata already exists in the outputdir and an rpm is unchanged
+(based on file size and mtime) since the metadata was generated, reuse
+the existing metadata rather than recalculating it. In the case of a
+large repository with only a few new or modified rpms this can
+significantly reduce I/O and processing time.
+.br
 .IP "\fB\-C --checkts\fP"
 Don't generate repo metadata, if their timestamps are newer than its rpms.
 This option decreases the processing time drastically again, if you happen
diff --git a/genpkgmetadata.py b/genpkgmetadata.py
index 4647d47..22b7c85 100755
--- a/genpkgmetadata.py
+++ b/genpkgmetadata.py
@@ -30,6 +30,7 @@ import fnmatch
 import shutil
 
 import dumpMetadata
+import readMetadata
 from dumpMetadata import _gzipOpen
 __version__ = '0.4.9'
 
@@ -61,6 +62,7 @@ def usage(retval=1):
      -h, --help = show this help
      -V, --version = output version
      -p, --pretty = output xml files in pretty format.
+     --update = update existing metadata (if present)
      -d, --database = generate the sqlite databases.
     """)
 
@@ -124,6 +126,18 @@ class MetaDataGenerator:
         """all the heavy lifting for the package metadata"""
 
         # rpms we're going to be dealing with
+        if self.cmds['update']:
+            #build the paths
+            basefile = os.path.join(self.cmds['outputdir'], self.cmds['finaldir'], self.cmds['primaryfile'])
+            flfile = os.path.join(self.cmds['outputdir'], self.cmds['finaldir'], self.cmds['filelistsfile'])
+            otherfile = os.path.join(self.cmds['outputdir'], self.cmds['finaldir'], self.cmds['otherfile'])
+            opts = {
+                'verbose' : self.cmds['verbose'],
+                'pkgdir' : os.path.normpath(os.path.join(self.cmds['basedir'], directory))
+            }
+            #and scan the old repo
+            self.oldData = readMetadata.MetadataIndex(self.cmds['outputdir'],
+                                                      basefile, flfile, otherfile, opts)
         files = self.getFileList(self.cmds['basedir'], directory, '.rpm')
         files = self.trimRpms(files)
         self.pkgcount = len(files)
@@ -174,61 +188,82 @@ class MetaDataGenerator:
         self.otherfile.write('<otherdata xmlns="http://linux.duke.edu/metadata/other" packages="%s">\n' %
                        self.pkgcount)
 
+    def _getNodes(self, file, directory, current):
+        """Generate the primary, filelists and other nodes for one rpm
+
+        Returns a (basenode, filesnode, othernode) tuple, or None if the
+        rpm could not be read or any of the nodes could not be generated
+        """
+        basenode = None
+        filesnode = None
+        othernode = None
+        try:
+            rpmdir= os.path.join(self.cmds['basedir'], directory)
+            mdobj = dumpMetadata.RpmMetaData(self.ts, rpmdir, file, self.cmds)
+        except dumpMetadata.MDError, e:
+            errorprint('\n%s - %s' % (e, file))
+            return None
+        try:
+            basenode = dumpMetadata.generateXML(self.basedoc, self.baseroot, self.formatns, mdobj, self.cmds['sumtype'])
+        except dumpMetadata.MDError, e:
+            errorprint(_('\nAn error occurred creating primary metadata: %s') % e)
+            return None
+        try:
+            filesnode = dumpMetadata.fileListXML(self.filesdoc, self.filesroot, mdobj)
+        except dumpMetadata.MDError, e:
+            errorprint(_('\nAn error occurred creating filelists: %s') % e)
+            return None
+        try:
+            othernode = dumpMetadata.otherXML(self.otherdoc, self.otherroot, mdobj)
+        except dumpMetadata.MDError, e:
+            errorprint(_('\nAn error occurred: %s') % e)
+            return None
+        return basenode,filesnode,othernode
+
     def writeMetadataDocs(self, files, directory, current=0):
         for file in files:
             current+=1
-            try:
-                rpmdir= os.path.join(self.cmds['basedir'], directory)
-                mdobj = dumpMetadata.RpmMetaData(self.ts, rpmdir, file, self.cmds)
-                if not self.cmds['quiet']:
-                    if self.cmds['verbose']:
-                        print '%d/%d - %s' % (current, len(files), file)
-                    else:
-                        sys.stdout.write('\r' + ' ' * 80)
-                        sys.stdout.write("\r%d/%d - %s" % (current, self.pkgcount, file))
-                        sys.stdout.flush()
-            except dumpMetadata.MDError, e:
-                errorprint('\n%s - %s' % (e, file))
-                continue
-            else:
-                try:
-                    node = dumpMetadata.generateXML(self.basedoc, self.baseroot, self.formatns, mdobj, self.cmds['sumtype'])
-                except dumpMetadata.MDError, e:
-                    errorprint(_('\nAn error occurred creating primary metadata: %s') % e)
-                    continue
+            recycled = False
+            sep = '-'
+            if self.cmds['update']:
+                #see if we can pull the nodes from the old repo
+                nodes = self.oldData.getNodes(file)
+                if nodes is not None:
+                    recycled = True
+                    sep = '*'
+            if not recycled:
+                #scan rpm files
+                nodes = self._getNodes(file, directory, current)
+            if nodes is None:
+                #error already reported, carry on with the next package
+                continue
+            basenode, filenode, othernode = nodes
+            del nodes
+            if not self.cmds['quiet']:
+                if self.cmds['verbose']:
+                    print '%d/%d %s %s' % (current, self.pkgcount, sep, file)
                 else:
-                    output = node.serialize('UTF-8', self.cmds['pretty'])
-                    self.basefile.write(output)
-                    self.basefile.write('\n')
-                    node.unlinkNode()
-                    node.freeNode()
-                    del node
+                    sys.stdout.write('\r' + ' ' * 80)
+                    sys.stdout.write("\r%d/%d %s %s" % (current, self.pkgcount, sep, file))
+                    sys.stdout.flush()
+            if basenode is None:
+                continue
 
-                try:
-                    node = dumpMetadata.fileListXML(self.filesdoc, self.filesroot, mdobj)
-                except dumpMetadata.MDError, e:
-                    errorprint(_('\nAn error occurred creating filelists: %s') % e)
-                    continue
-                else:
-                    output = node.serialize('UTF-8', self.cmds['pretty'])
-                    self.flfile.write(output)
-                    self.flfile.write('\n')
+            for node, outfile in ((basenode,self.basefile),
+                                  (filenode,self.flfile),
+                                  (othernode,self.otherfile)):
+                if node is None:
+                    break
+                output = node.serialize('UTF-8', self.cmds['pretty'])
+                outfile.write(output)
+                outfile.write('\n')
+                if not recycled:
+                    #recycled nodes can be multiply referenced
                     node.unlinkNode()
                     node.freeNode()
-                    del node
+            if recycled:
+                self.oldData.freeNodes(file)
 
-                try:
-                    node = dumpMetadata.otherXML(self.otherdoc, self.otherroot, mdobj)
-                except dumpMetadata.MDError, e:
-                    errorprint(_('\nAn error occurred: %s') % e)
-                    continue
-                else:
-                    output = node.serialize('UTF-8', self.cmds['pretty'])
-                    self.otherfile.write(output)
-                    self.otherfile.write('\n')
-                    node.unlinkNode()
-                    node.freeNode()
-                    del node
         return current
 
 
@@ -379,6 +414,7 @@ def parseArgs(args):
     cmds['checkts'] = False
     cmds['mdtimestamp'] = 0
     cmds['split'] = False
+    cmds['update'] = False
     cmds['outputdir'] = ""
     cmds['database'] = False
     cmds['file-pattern-match'] = ['.*bin\/.*', '^\/etc\/.*', '^\/usr\/lib\/sendmail$']
@@ -390,7 +426,7 @@ def parseArgs(args):
                                                                   'quiet', 'verbose', 'cachedir=', 'basedir=',
                                                                   'baseurl=', 'groupfile=', 'checksum=',
                                                                   'version', 'pretty', 'split', 'outputdir=',
-                                                                  'noepoch', 'checkts', 'database',
+                                                                  'noepoch', 'checkts', 'database', 'update',
                                                                   'skip-symlinks'])
     except getopt.error, e:
         errorprint(_('Options Error: %s.') % e)
@@ -449,6 +485,8 @@ def parseArgs(args):
             elif arg in ['-c', '--cachedir']:
                 cmds['cache'] = True
                 cmds['cachedir'] = a
+            elif arg == '--update':
+                cmds['update'] = True
             elif arg in ['-C', '--checkts']:
                 cmds['checkts'] = True
             elif arg == '--basedir':
diff --git a/readMetadata.py b/readMetadata.py
new file mode 100644
index 0000000..60e6129
--- /dev/null
+++ b/readMetadata.py
@@ -0,0 +1,217 @@
+#!/usr/bin/python -t
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+# Copyright 2006 Red Hat
+
+import os
+import sys
+import libxml2
+import pprint
+import stat
+
+def errorprint(stuff):
+    """Print a message to stderr"""
+    print >> sys.stderr, stuff
+
+def _(args):
+    """Stub function for translation"""
+    return args
+
+class MetadataIndex(object):
+    """Index of the package nodes found in a previously generated repo
+
+    Primary nodes are only kept for rpms under opts['pkgdir'] that still
+    have the size and mtime recorded in the old metadata.
+    """
+
+    def __init__(self, outputdir, basefile, filelistfile, otherfile, opts=None):
+        """Remember the old metadata file paths and scan them
+
+        opts may provide 'verbose' (print progress) and must provide
+        'pkgdir' (the directory the location hrefs are relative to)
+        """
+        if opts is None:
+            opts = {}
+        self.opts = opts
+        self.outputdir = outputdir
+        self.files = {'base' : basefile,
+                      'filelist' : filelistfile,
+                      'other' : otherfile}
+        self.scan()
+
+    def scan(self):
+        """Read in and index old repo data"""
+        self.basenodes = {}
+        self.filesnodes = {}
+        self.othernodes = {}
+        self.pkg_ids = {}
+        #set up front so the attribute exists even if we cannot scan
+        self.pkgrefs = {}
+        if self.opts.get('verbose'):
+            print _("Scanning old repo data")
+        for file in self.files.values():
+            if not os.path.exists(file):
+                #cannot scan
+                errorprint(_("Previous repo file missing: %s") % file)
+                return
+        root = libxml2.parseFile(self.files['base']).getRootElement()
+        self._scanPackageNodes(root, self._handleBase)
+        if self.opts.get('verbose'):
+            print _("Indexed %i base nodes") % len(self.basenodes)
+        root = libxml2.parseFile(self.files['filelist']).getRootElement()
+        self._scanPackageNodes(root, self._handleFiles)
+        if self.opts.get('verbose'):
+            print _("Indexed %i filelist nodes") % len(self.filesnodes)
+        root = libxml2.parseFile(self.files['other']).getRootElement()
+        self._scanPackageNodes(root, self._handleOther)
+        if self.opts.get('verbose'):
+            print _("Indexed %i other nodes") % len(self.othernodes)
+        #reverse index pkg ids to track references
+        for relpath, pkgid in self.pkg_ids.iteritems():
+            self.pkgrefs.setdefault(pkgid,[]).append(relpath)
+
+    def _scanPackageNodes(self, root, handler):
+        """Call handler on each package element directly under root"""
+        node = root.children
+        while node is not None:
+            if node.type != "element":
+                node = node.next
+                continue
+            if node.name == "package":
+                handler(node)
+            node = node.next
+
+    def _handleBase(self, node):
+        """Index a primary package node if its rpm is unchanged on disk"""
+        top = node
+        node = node.children
+        pkgid = None
+        mtime = None
+        size = None
+        relpath = None
+        while node is not None:
+            if node.type != "element":
+                node = node.next
+                continue
+            if node.name == "checksum":
+                pkgid = node.content
+            elif node.name == "time":
+                mtime = int(node.prop('file'))
+            elif node.name == "size":
+                size = int(node.prop('package'))
+            elif node.name == "location":
+                relpath = node.prop('href')
+            node = node.next
+        if relpath is None:
+            print _("Incomplete data for node")
+            return
+        if pkgid is None:
+            print _("pkgid missing for %s") % relpath
+            return
+        if mtime is None:
+            print _("mtime missing for %s") % relpath
+            return
+        if size is None:
+            print _("size missing for %s") % relpath
+            return
+        filepath = os.path.join(self.opts['pkgdir'], relpath)
+        try:
+            st = os.stat(filepath)
+        except OSError:
+            #file missing -- ignore
+            return
+        if not stat.S_ISREG(st.st_mode):
+            #ignore non files
+            return
+        #check size and mtime
+        if st.st_size != size:
+            if self.opts.get('verbose'):
+                print _("Size (%i -> %i) changed for file %s") % (size,st.st_size,filepath)
+            return
+        #st_mtime may be a float, the metadata holds whole seconds
+        if int(st.st_mtime) != mtime:
+            if self.opts.get('verbose'):
+                print _("Modification time changed for %s") % filepath
+            return
+        #otherwise we index
+        self.basenodes[relpath] = top
+        self.pkg_ids[relpath] = pkgid
+
+    def _handleFiles(self, node):
+        """Index a filelists package node by its pkgid"""
+        pkgid = node.prop('pkgid')
+        if pkgid:
+            self.filesnodes[pkgid] = node
+
+    def _handleOther(self, node):
+        """Index an other package node by its pkgid"""
+        pkgid = node.prop('pkgid')
+        if pkgid:
+            self.othernodes[pkgid] = node
+
+    def getNodes(self, relpath):
+        """Return base, filelist, and other nodes for file, if they exist
+
+        Returns a tuple of nodes, or None if not found
+        """
+        bnode = self.basenodes.get(relpath,None)
+        if bnode is None:
+            return None
+        pkgid = self.pkg_ids.get(relpath,None)
+        if pkgid is None:
+            print _("No pkgid found for: %s") % relpath
+            return None
+        fnode = self.filesnodes.get(pkgid,None)
+        if fnode is None:
+            return None
+        onode = self.othernodes.get(pkgid,None)
+        if onode is None:
+            return None
+        return bnode, fnode, onode
+
+    def freeNodes(self,relpath):
+        #causing problems
+        """Free up nodes corresponding to file, if possible"""
+        bnode = self.basenodes.get(relpath,None)
+        if bnode is None:
+            print _("Missing node for %s") % relpath
+            return
+        bnode.unlinkNode()
+        bnode.freeNode()
+        del self.basenodes[relpath]
+        pkgid = self.pkg_ids.get(relpath,None)
+        if pkgid is None:
+            print _("No pkgid found for: %s") % relpath
+            return None
+        del self.pkg_ids[relpath]
+        dups = self.pkgrefs.get(pkgid)
+        dups.remove(relpath)
+        if len(dups):
+            #still referenced
+            return
+        del self.pkgrefs[pkgid]
+        for nodes in self.filesnodes, self.othernodes:
+            node = nodes.get(pkgid)
+            if node is not None:
+                node.unlinkNode()
+                node.freeNode()
+                del nodes[pkgid]
+
+
+if __name__ == "__main__":
+    #test code - attempts to read a repo in working directory
+    idx = MetadataIndex(".", "repodata/primary.xml.gz", "repodata/filelists.xml.gz",
+                        "repodata/other.xml.gz",
+                        {'verbose':1, 'pkgdir':'.'})
-- 
2.34.1