src/third_party/chromite/lib/paygen/download_cache.py

   1 # Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
   2 # Use of this source code is governed by a BSD-style license that can be
   3 # found in the LICENSE file.
   4
   5 """Downloads files upon request in a thread/process safe way.
   6
   7 DEPRECATED: Should be merged into chromite.lib.cache.
   8 """
   9
  10 from __future__ import print_function
  11
import hashlib
import md5
import os
import shutil
import stat
import tempfile
import time
  18
  19 import fixup_path
  20 fixup_path.FixupPath()
  21
  22 from chromite.lib.paygen import flock
  23 from chromite.lib.paygen import urilib
  24 from chromite.lib.paygen import utils
  25
  26
# Maximum number of attempts DownloadCache.GetFileObject makes to obtain a
# file before raising RetriesExhaustedError.
FETCH_RETRY_COUNT = 10
# NOTE(review): not referenced anywhere in this file as shown; presumably kept
# for external callers -- confirm before removing.
DEFAULT_DAYS_TO_KEEP = 1
# Number of seconds in one day; the default max_age of a DownloadCache.
ONE_DAY = 24 * 60 * 60
  30
  31
  32 class RetriesExhaustedError(Exception):
  33   """Raised when we make too many attempts to download the same file."""
  34
  35
  36 class DownloadCache(object):
  37   """This class downloads files into a local directory upon request.
  38
  39   This classes uses locking to make this safe across processes, and
  40   threads.
  41
  42   Example usage:
  43
  44     # This will create the cache dir, and purge old contents.
  45     cache = DownloadCache('/tmp/my_cache')
  46
  47     # file is copied into file, blocking for download if needed.
  48     cache.GetFileCopy('gs://bucket/foo', '/tmp/foo')
  49
  50     # file is loaded into cache, but not locked.
  51     tempfile = cache.GetFileInTempFile('gs://bucket/foo')
  52     tempfile.close()
  53   """
  54
  55   # Name of the purge management lock over the entire cache.
  56   _CACHE_LOCK = 'cache.lock'
  57   _FILE_DIR = 'cache'
  58   _LOCK_DIR = 'lock'
  59
  60   _GET_FILE_SPIN_DELAY = 2
  61
  62   def __init__(self, cache_dir, max_age=ONE_DAY, cache_size=None):
  63     """Create a DownloadCache.
  64
  65     Since Purging is not performed very often, we can exceed max_age or
  66     cache_size.
  67
  68     Args:
  69       cache_dir: The directory in which to create the cache.
  70       max_age: Purge files not used for this number of seconds. None for no
  71                max_age.
  72       cache_size: Purge the least recently used files until the cache is
  73                   below this size in bytes. None for no size limit.
  74
  75       If no condition is provided, we purge all files unused for one full day.
  76     """
  77     # One directory for cached files, one for lock files.
  78     self._cache_dir = os.path.realpath(cache_dir)
  79     self._file_dir = os.path.join(self._cache_dir, self._FILE_DIR)
  80     self._lock_dir = os.path.join(self._cache_dir, self._LOCK_DIR)
  81
  82     self._max_age = max_age
  83     self._cache_size = cache_size
  84
  85     self._SetupCache()
  86
  87   def _SetupCache(self):
  88     """Make sure that our cache contains only files/directories we expect."""
  89     try:
  90       # The purge lock ensures nobody else is modifying the cache in any way.
  91       with self._PurgeLock(blocking=False, shared=False):
  92         # We have changed the layout of our cache directories over time.
  93         # Clean up any left over files.
  94         expected = (self._CACHE_LOCK, self._FILE_DIR, self._LOCK_DIR)
  95         unexpected = set(os.listdir(self._cache_dir)).difference(expected)
  96
  97         for name in unexpected:
  98           filename = os.path.join(self._cache_dir, name)
  99           if os.path.isdir(filename):
 100             shutil.rmtree(filename)
 101           else:
 102             os.unlink(filename)
 103
 104         # Create the cache file dir if needed.
 105         if not os.path.exists(self._file_dir):
 106           os.makedirs(self._file_dir)
 107
 108         # Create the lock dir if needed.
 109         if not os.path.exists(self._lock_dir):
 110           os.makedirs(self._lock_dir)
 111     except flock.LockNotAcquired:
 112       # If we can't get an exclusive lock on the cache, someone else set it up.
 113       pass
 114
 115   def _UriToCacheFile(self, uri):
 116     """Convert a URI to an cache file (full path).
 117
 118     Args:
 119       uri: The uri of the file to be cached locally.
 120
 121     Returns:
 122       The full path file name of the cache file associated with a given URI.
 123     """
 124     # We use the md5 hash of the URI as our file name. This allows us to
 125     # store all cache files in a single directory, which removes race
 126     # conditions around directories.
 127     m = md5.new(uri)
 128     return os.path.join(self._file_dir, m.digest().encode('hex'))
 129
 130   def _PurgeLock(self, blocking=False, shared=False):
 131     """Acquire a lock on the cache as a whole.
 132
 133     An exclusive lock proves nobody else will modify anything, and nobody
 134     else will hold any _CacheFileLocks. A shared lock is required before
 135     getting any kind of _CacheFileLock.
 136
 137     Args:
 138       blocking: Block until the lock is available?
 139       shared: Get a shared lock, or an exclusive lock?
 140
 141     Returns:
 142       flock.Lock (not acquired)
 143     """
 144     return flock.Lock(lock_name=self._CACHE_LOCK,
 145                       lock_dir=self._cache_dir,
 146                       blocking=blocking,
 147                       shared=shared)
 148
 149   def _CacheFileLock(self, cache_file, blocking=False, shared=False):
 150     """Acquire a lock on a file in the cache.
 151
 152     A shared lock will ensure no other processes are modifying the file, but
 153     getting it does not ensure that the file in question actually exists.
 154
 155     An exclusive lock is required to modify a cache file, this usually means
 156     downloading it.
 157
 158     A shared _PurgeLock should be held before trying to acquire any type
 159     of cache file lock.
 160
 161     Args:
 162       cache_file: The full path of file in cache to lock.
 163       blocking: Block until the lock is available?
 164       shared: Get a shared lock, or an exclusive lock?
 165
 166     Returns:
 167       flock.Lock (not acquired)
 168     """
 169     return flock.Lock(lock_name=os.path.basename(cache_file),
 170                       lock_dir=self._lock_dir,
 171                       blocking=blocking,
 172                       shared=shared)
 173
 174   def Purge(self, max_age=None, cache_size=None):
 175     """Attempts to clean up the cache contents.
 176
 177     Is a no-op if cache lock is not acquirable.
 178
 179     Args:
 180       max_age: Overrides the __init__ max_age for this one
 181                        purge. Mostly intended for unittests.
 182       cache_size: Overrides the __init__ cache_size for this one
 183                        purge. Mostly intended for unittests.
 184     """
 185     max_age = self._max_age if max_age is None else max_age
 186     cache_size = self._cache_size if cache_size is None else cache_size
 187
 188     try:
 189       # Prevent other changes while we purge the cache.
 190       with self._PurgeLock(shared=False, blocking=False):
 191
 192         # Purge files based on age, if specified.
 193         if max_age is not None:
 194           now = time.time()
 195           for f in utils.ListdirFullpath(self._file_dir):
 196             if (now - os.path.getmtime(f)) > max_age:
 197               os.unlink(f)
 198
 199         # Purge files based on size, if specified.
 200         if cache_size is not None:
 201           # Find cache files, and sort them so the oldest are first.
 202           # This defines which ones we will purge first.
 203           cache_files = utils.ListdirFullpath(self._file_dir)
 204           cache_files.sort(key=os.path.getmtime)
 205
 206           sizes = [os.path.getsize(f) for f in cache_files]
 207           total_size = sum(sizes)
 208
 209           # Remove files until we are small enough to fit.
 210           for f, size in zip(cache_files, sizes):
 211             if total_size < cache_size:
 212               break
 213             total_size -= size
 214             os.unlink(f)
 215
 216         # Just remove all lock files. They will be recreated as needed.
 217         shutil.rmtree(self._lock_dir)
 218         os.makedirs(self._lock_dir)
 219
 220     except flock.LockNotAcquired:
 221       # If we can't get an exclusive lock on the file, it's in use, leave it.
 222       pass
 223
 224   def _FetchIntoCache(self, uri, cache_file):
 225     """This function downloads the specified file (if not already local).
 226
 227     You must hold the PurgeLock when calling this method.
 228
 229     If it can't get an exclusive lock, or if the file is already present,
 230     it does nothing.
 231
 232     Args:
 233       uri: uri of the file to download.
 234       cache_file: location in the cache to download too.
 235
 236     Returns:
 237       True if a file was downloaded, False otherwise. (used in unittests)
 238
 239     Raises:
 240       May raise any download error associated with the URI's protocol.
 241     """
 242     try:
 243       # Write protect the file before modifying it.
 244       with self._CacheFileLock(cache_file, shared=False, blocking=False):
 245         if os.path.exists(cache_file):
 246           return False
 247
 248         try:
 249           # Actually download the file.
 250           urilib.Copy(uri, cache_file)
 251           # Make the file read-only by everyone.
 252           os.chmod(cache_file, stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)
 253         except:
 254           # If there was any error with the download, make sure no partial
 255           # file was left behind.
 256           if os.path.exists(cache_file):
 257             os.unlink(cache_file)
 258           raise
 259
 260     except flock.LockNotAcquired:
 261       # In theory, if it's already locked, that either means a download is in
 262       # progress, or there is a shared lock which means it's already present.
 263       return False
 264
 265     # Try to cleanup the cache after we just grew it.
 266     self.Purge()
 267     return True
 268
 269   def GetFileObject(self, uri):
 270     """Get an open readonly File object for the file in the cache.
 271
 272     This method will populate the cache with the requested file if it's
 273     not already present, and will return an already opened read only file
 274     object for the cache contents.
 275
 276     Even if the file is purged, this File object will remain valid until
 277     closed. Since this method is the only legitimate way to get access to
 278     a file in the cache, and it returns read only Files, cache files should
 279     never be modified.
 280
 281     This method may block while trying to download and/or lock the file.
 282
 283     Args:
 284       uri: The uri of the file to access.
 285
 286     Returns:
 287       File object opened with 'rb' mode.
 288
 289     Raises:
 290       Exceptions from a failed download are passed through 'as is' from
 291       the underlying download mechanism.
 292
 293       RetriesExhaustedError if we need a large number of attempts to
 294       download the same file.
 295     """
 296     cache_file = self._UriToCacheFile(uri)
 297
 298     # We keep trying until we succeed, or throw an exception.
 299     for _ in xrange(FETCH_RETRY_COUNT):
 300       with self._PurgeLock(shared=True, blocking=True):
 301         # Attempt to download the file, if needed.
 302         self._FetchIntoCache(uri, cache_file)
 303
 304         # Get a shared lock on the file. This can block if another process
 305         # has a non-shared lock (ie: they are downloading)
 306         with self._CacheFileLock(cache_file, shared=True, blocking=True):
 307
 308           if os.path.exists(cache_file):
 309             fd = open(cache_file, 'rb')
 310
 311             # Touch the timestamp on cache file to help purging logic.
 312             os.utime(cache_file, None)
 313
 314             return fd
 315           else:
 316             # We don't have the file in our cache. There are three ways this
 317             # can happen:
 318             #
 319             # A) Another process was trying to download, blocked our download,
 320             #    then got a download error.
 321             # B) Another process removed the file(illegally). We will recover as
 322             #    soon as all read-only locks are released.
 323             # C) Our download failed without throwing an exception. We will
 324             #    block forever if this continues to happen.
 325
 326             # Sleep so we don't spin too quickly, then try again.
 327             time.sleep(self._GET_FILE_SPIN_DELAY)
 328
 329     raise RetriesExhaustedError(uri)
 330
 331   def GetFileCopy(self, uri, filepath):
 332     """Copy a cache file into your file (downloading as needed).
 333
 334     Copy the file into your specified filename (creating or overridding). It
 335     will be downloaded into the cache first, if needed. It is your
 336     responsibility to manage filepath after it is populated.
 337
 338     Args:
 339       uri: The uri of the file to access.
 340       filepath: The name of the file to copy uri contents into.
 341
 342     Raises:
 343       Exceptions from a failed download are passed through 'as is' from
 344       the underlying download mechanism.
 345     """
 346     with self.GetFileObject(uri) as src:
 347       with open(filepath, 'wb+') as dest:
 348         shutil.copyfileobj(src, dest)
 349
 350   def GetFileInTempFile(self, uri):
 351     """Copy a cache file into a tempfile (downloading as needed).
 352
 353     The cache file is copied into a tempfile.NamedTemporaryFile.
 354
 355     This file is owned strictly by the caller and can be modified/deleted as
 356     needed. Closing the NamedTemporaryFile will delete it.
 357
 358     Args:
 359       uri: The uri of the file to access.
 360
 361     Returns:
 362       tempfile.NamedTemporaryFile containing the requested file.
 363       NamedTemporaryFile.name will contain the file's name.
 364
 365     Raises:
 366       Exceptions from a failed download are passed through 'as is' from
 367       the underlying download mechanism.
 368     """
 369     temp = tempfile.NamedTemporaryFile()
 370     self.GetFileCopy(uri, temp.name)
 371     return temp
 372
 373   # Cache objects can be used with "with" statements.
 374   def __enter__(self):
 375     return self
 376
 377   def __exit__(self, _type, _value, _traceback):
 378     self.Purge()