Upstream version 9.38.198.0

[platform/framework/web/crosswalk.git] / src / third_party / skia / gm / rebaseline_server / imagediffdb.py
diff --git a/src/third_party/skia/gm/rebaseline_server/imagediffdb.py b/src/third_party/skia/gm/rebaseline_server/imagediffdb.py

index 6b68414..3c5f45f 100644 (file)
--- a/src/third_party/skia/gm/rebaseline_server/imagediffdb.py
+++ b/src/third_party/skia/gm/rebaseline_server/imagediffdb.py
@@ -9,40 +9,41 @@ found in the LICENSE file.
  Calulate differences between image pairs, and store them in a database.
  """
  
+# System-level imports
  import contextlib
-import csv
+import errno
+import json
  import logging
  import os
+import Queue
  import re
  import shutil
-import sys
  import tempfile
+import threading
+import time
  import urllib
-try:
-  from PIL import Image, ImageChops
-except ImportError:
-  raise ImportError('Requires PIL to be installed; see '
-                    + 'http://www.pythonware.com/products/pil/')
-
-# Set the PYTHONPATH to include the tools directory.
-sys.path.append(
-    os.path.join(
-        os.path.dirname(os.path.realpath(__file__)), os.pardir, os.pardir,
-                        'tools'))
+
+# Must fix up PYTHONPATH before importing from within Skia
+import fix_pythonpath  # pylint: disable=W0611
+
+# Imports from within Skia
  import find_run_binary
+from py.utils import gs_utils
+
  
  SKPDIFF_BINARY = find_run_binary.find_path_to_program('skpdiff')
  
  DEFAULT_IMAGE_SUFFIX = '.png'
  DEFAULT_IMAGES_SUBDIR = 'images'
+# TODO(epoger): Figure out a better default number of threads; for now,
+# using a conservative default value.
+DEFAULT_NUM_WORKER_THREADS = 1
  
  DISALLOWED_FILEPATH_CHAR_REGEX = re.compile('[^\w\-]')
  
-DIFFS_SUBDIR = 'diffs'
+RGBDIFFS_SUBDIR = 'diffs'
  WHITEDIFFS_SUBDIR = 'whitediffs'
  
-VALUES_PER_BAND = 256
-
  # Keys used within DiffRecord dictionary representations.
  # NOTE: Keep these in sync with static/constants.js
  KEY__DIFFERENCES__MAX_DIFF_PER_CHANNEL = 'maxDiffPerChannel'
@@ -50,11 +51,23 @@ KEY__DIFFERENCES__NUM_DIFF_PIXELS = 'numDifferingPixels'
  KEY__DIFFERENCES__PERCENT_DIFF_PIXELS = 'percentDifferingPixels'
  KEY__DIFFERENCES__PERCEPTUAL_DIFF = 'perceptualDifference'
  
+# Special values within ImageDiffDB._diff_dict
+_DIFFRECORD_FAILED = 'failed'
+_DIFFRECORD_PENDING = 'pending'
+
+# How often to report tasks_queue size
+QUEUE_LOGGING_GRANULARITY = 1000
+
+# Temporary variable to keep track of how many times we download
+# the same file in multiple threads.
+# TODO(epoger): Delete this, once we see that the number stays close to 0.
+global_file_collisions = 0
+
  
  class DiffRecord(object):
    """ Record of differences between two images. """
  
-  def __init__(self, storage_root,
+  def __init__(self, gs, storage_root,
                 expected_image_url, expected_image_locator,
                 actual_image_url, actual_image_locator,
                 expected_images_subdir=DEFAULT_IMAGES_SUBDIR,
@@ -63,18 +76,16 @@ class DiffRecord(object):
      """Download this pair of images (unless we already have them on local disk),
      and prepare a DiffRecord for them.
  
-    TODO(epoger): Make this asynchronously download images, rather than blocking
-    until the images have been downloaded and processed.
-
      Args:
+      gs: instance of GSUtils object we can use to download images
        storage_root: root directory on local disk within which we store all
            images
-      expected_image_url: file or HTTP url from which we will download the
+      expected_image_url: file, GS, or HTTP url from which we will download the
            expected image
        expected_image_locator: a unique ID string under which we will store the
            expected image within storage_root (probably including a checksum to
            guarantee uniqueness)
-      actual_image_url: file or HTTP url from which we will download the
+      actual_image_url: file, GS, or HTTP url from which we will download the
            actual image
        actual_image_locator: a unique ID string under which we will store the
            actual image within storage_root (probably including a checksum to
@@ -87,91 +98,92 @@ class DiffRecord(object):
      actual_image_locator = _sanitize_locator(actual_image_locator)
  
      # Download the expected/actual images, if we don't have them already.
-    # TODO(rmistry): Add a parameter that makes _download_and_open_image raise
-    # an exception if images are not found locally (instead of trying to
-    # download them).
      expected_image_file = os.path.join(
          storage_root, expected_images_subdir,
          str(expected_image_locator) + image_suffix)
      actual_image_file = os.path.join(
          storage_root, actual_images_subdir,
          str(actual_image_locator) + image_suffix)
+    for image_file, image_url in [
+        (expected_image_file, expected_image_url),
+        (actual_image_file, actual_image_url)]:
+      if image_file and image_url:
+        try:
+          _download_file(gs, image_file, image_url)
+        except Exception:
+          logging.exception('unable to download image_url %s to file %s' %
+                            (image_url, image_file))
+          raise
+
+    # Return early if we do not need to generate diffs.
+    if (expected_image_url == actual_image_url or
+        not expected_image_url or not actual_image_url):
+      return
+
+    # Get all diff images and values using the skpdiff binary.
+    skpdiff_output_dir = tempfile.mkdtemp()
      try:
-      expected_image = _download_and_open_image(
-          expected_image_file, expected_image_url)
-    except Exception:
-      logging.exception('unable to download expected_image_url %s to file %s' %
-                        (expected_image_url, expected_image_file))
-      raise
-    try:
-      actual_image = _download_and_open_image(
-          actual_image_file, actual_image_url)
-    except Exception:
-      logging.exception('unable to download actual_image_url %s to file %s' %
-                        (actual_image_url, actual_image_file))
-      raise
-
-    # Generate the diff image (absolute diff at each pixel) and
-    # max_diff_per_channel.
-    diff_image = _generate_image_diff(actual_image, expected_image)
-    diff_histogram = diff_image.histogram()
-    (diff_width, diff_height) = diff_image.size
-    self._max_diff_per_channel = _max_per_band(diff_histogram)
-
-    # Generate the whitediff image (any differing pixels show as white).
-    # This is tricky, because when you convert color images to grayscale or
-    # black & white in PIL, it has its own ideas about thresholds.
-    # We have to force it: if a pixel has any color at all, it's a '1'.
-    bands = diff_image.split()
-    graydiff_image = ImageChops.lighter(ImageChops.lighter(
-        bands[0], bands[1]), bands[2])
-    whitediff_image = (graydiff_image.point(lambda p: p > 0 and VALUES_PER_BAND)
-                                     .convert('1', dither=Image.NONE))
-
-    # Calculate the perceptual difference percentage.
-    skpdiff_csv_dir = tempfile.mkdtemp()
-    try:
-      skpdiff_csv_output = os.path.join(skpdiff_csv_dir, 'skpdiff-output.csv')
-      expected_img = os.path.join(storage_root, expected_images_subdir,
-                                  str(expected_image_locator) + image_suffix)
-      actual_img = os.path.join(storage_root, actual_images_subdir,
-                                str(actual_image_locator) + image_suffix)
+      skpdiff_summary_file = os.path.join(skpdiff_output_dir,
+                                          'skpdiff-output.json')
+      skpdiff_rgbdiff_dir = os.path.join(storage_root, RGBDIFFS_SUBDIR)
+      skpdiff_whitediff_dir = os.path.join(storage_root, WHITEDIFFS_SUBDIR)
+      _mkdir_unless_exists(skpdiff_rgbdiff_dir)
+      _mkdir_unless_exists(skpdiff_rgbdiff_dir)
+
+      # TODO(epoger): Consider calling skpdiff ONCE for all image pairs,
+      # instead of calling it separately for each image pair.
+      # Pro: we'll incur less overhead from making repeated system calls,
+      # spinning up the skpdiff binary, etc.
+      # Con: we would have to wait until all image pairs were loaded before
+      # generating any of the diffs?
+      # Note(stephana): '--longnames' was added to allow for this 
+      # case (multiple files at once) versus specifying output diffs 
+      # directly.
        find_run_binary.run_command(
-          [SKPDIFF_BINARY, '-p', expected_img, actual_img,
-           '--csv', skpdiff_csv_output, '-d', 'perceptual'])
-      with contextlib.closing(open(skpdiff_csv_output)) as csv_file:
-        for row in csv.DictReader(csv_file):
-          perceptual_similarity = float(row[' perceptual'].strip())
-          if not 0 <= perceptual_similarity <= 1:
-            # skpdiff outputs -1 if the images are different sizes. Treat any
-            # output that does not lie in [0, 1] as having 0% perceptual
-            # similarity.
-            perceptual_similarity = 0
-          # skpdiff returns the perceptual similarity, convert it to get the
-          # perceptual difference percentage.
-          self._perceptual_difference = 100 - (perceptual_similarity * 100)
+          [SKPDIFF_BINARY, '-p', expected_image_file, actual_image_file,
+           '--jsonp', 'false',
+           '--longnames', 'true',
+           '--output', skpdiff_summary_file,
+           '--differs', 'perceptual', 'different_pixels',
+           '--rgbDiffDir', skpdiff_rgbdiff_dir,
+           '--whiteDiffDir', skpdiff_whitediff_dir,
+           ])
+
+      # Get information out of the skpdiff_summary_file.
+      with contextlib.closing(open(skpdiff_summary_file)) as fp:
+        data = json.load(fp)
+
+      # For now, we can assume there is only one record in the output summary,
+      # since we passed skpdiff only one pair of images.
+      record = data['records'][0]
+      self._width = record['width']
+      self._height = record['height']
+      # TODO: make max_diff_per_channel a tuple instead of a list, because the
+      # structure is meaningful (first element is red, second is green, etc.)
+      # See http://stackoverflow.com/a/626871
+      self._max_diff_per_channel = [
+          record['maxRedDiff'], record['maxGreenDiff'], record['maxBlueDiff']]
+      per_differ_stats = record['diffs']
+      for stats in per_differ_stats:
+        differ_name = stats['differName']
+        if differ_name == 'different_pixels':
+          self._num_pixels_differing = stats['pointsOfInterest']
+        elif differ_name == 'perceptual':
+          perceptual_similarity = stats['result']
+
+      # skpdiff returns the perceptual similarity; convert it to get the
+      # perceptual difference percentage.
+      # skpdiff outputs -1 if the images are different sizes. Treat any
+      # output that does not lie in [0, 1] as having 0% perceptual
+      # similarity.
+      if not 0 <= perceptual_similarity <= 1:
+        perceptual_similarity = 0
+      self._perceptual_difference = 100 - (perceptual_similarity * 100)
      finally:
-      shutil.rmtree(skpdiff_csv_dir)
-
-    # Final touches on diff_image: use whitediff_image as an alpha mask.
-    # Unchanged pixels are transparent; differing pixels are opaque.
-    diff_image.putalpha(whitediff_image)
-
-    # Store the diff and whitediff images generated above.
-    diff_image_locator = _get_difference_locator(
-        expected_image_locator=expected_image_locator,
-        actual_image_locator=actual_image_locator)
-    basename = str(diff_image_locator) + image_suffix
-    _save_image(diff_image, os.path.join(
-        storage_root, DIFFS_SUBDIR, basename))
-    _save_image(whitediff_image, os.path.join(
-        storage_root, WHITEDIFFS_SUBDIR, basename))
-
-    # Calculate difference metrics.
-    (self._width, self._height) = diff_image.size
-    self._num_pixels_differing = (
-        whitediff_image.histogram()[VALUES_PER_BAND - 1])
+      shutil.rmtree(skpdiff_output_dir)
  
+  # TODO(epoger): Use properties instead of getters throughout.
+  # See http://stackoverflow.com/a/6618176
    def get_num_pixels_differing(self):
      """Returns the absolute number of pixels that differ."""
      return self._num_pixels_differing
@@ -207,38 +219,127 @@ class ImageDiffDB(object):
    """ Calculates differences between image pairs, maintaining a database of
    them for download."""
  
-  def __init__(self, storage_root):
+  def __init__(self, storage_root, gs=None,
+               num_worker_threads=DEFAULT_NUM_WORKER_THREADS):
      """
      Args:
        storage_root: string; root path within the DB will store all of its stuff
+      gs: instance of GSUtils object we can use to download images
+      num_worker_threads: how many threads that download images and
+          generate diffs simultaneously
      """
      self._storage_root = storage_root
+    self._gs = gs
+
+    # Mechanism for reporting queue size periodically.
+    self._last_queue_size_reported = None
+    self._queue_size_report_lock = threading.RLock()
  
      # Dictionary of DiffRecords, keyed by (expected_image_locator,
      # actual_image_locator) tuples.
+    # Values can also be _DIFFRECORD_PENDING, _DIFFRECORD_FAILED.
+    #
+    # Any thread that modifies _diff_dict must first acquire
+    # _diff_dict_writelock!
+    #
+    # TODO(epoger): Disk is limitless, but RAM is not... so, we should probably
+    # remove items from self._diff_dict if they haven't been accessed for a
+    # long time.  We can always regenerate them by diffing the images we
+    # previously downloaded to local disk.
+    # I guess we should figure out how expensive it is to download vs diff the
+    # image pairs... if diffing them is expensive too, we can write these
+    # _diff_dict objects out to disk if there's too many to hold in RAM.
+    # Or we could use virtual memory to handle that automatically.
      self._diff_dict = {}
+    self._diff_dict_writelock = threading.RLock()
+
+    # Set up the queue for asynchronously loading DiffRecords, and start the
+    # worker threads reading from it.
+    # The queue maxsize must be 0 (infinite size queue), so that asynchronous
+    # calls can return as soon as possible.
+    self._tasks_queue = Queue.Queue(maxsize=0)
+    self._workers = []
+    for i in range(num_worker_threads):
+      worker = threading.Thread(target=self.worker, args=(i,))
+      worker.daemon = True
+      worker.start()
+      self._workers.append(worker)
+
+  def log_queue_size_if_changed(self, limit_verbosity=True):
+    """Log the size of self._tasks_queue, if it has changed since the last call.
+
+    Reports the current queue size, using log.info(), unless the queue is the
+    same size as the last time we reported it.
+
+    Args:
+      limit_verbosity: if True, only log if the queue size is a multiple of
+          QUEUE_LOGGING_GRANULARITY
+    """
+    # Acquire the lock, to synchronize access to self._last_queue_size_reported
+    self._queue_size_report_lock.acquire()
+    try:
+      size = self._tasks_queue.qsize()
+      if size == self._last_queue_size_reported:
+        return
+      if limit_verbosity and (size % QUEUE_LOGGING_GRANULARITY != 0):
+        return
+      logging.info('tasks_queue size is %d' % size)
+      self._last_queue_size_reported = size
+    finally:
+      self._queue_size_report_lock.release()
+
+  def worker(self, worker_num):
+    """Launch a worker thread that pulls tasks off self._tasks_queue.
+
+    Args:
+      worker_num: (integer) which worker this is
+    """
+    while True:
+      self.log_queue_size_if_changed()
+      params = self._tasks_queue.get()
+      key, expected_image_url, actual_image_url = params
+      try:
+        diff_record = DiffRecord(
+            self._gs, self._storage_root,
+            expected_image_url=expected_image_url,
+            expected_image_locator=key[0],
+            actual_image_url=actual_image_url,
+            actual_image_locator=key[1])
+      except Exception:
+        logging.exception(
+            'exception while creating DiffRecord for key %s' % str(key))
+        diff_record = _DIFFRECORD_FAILED
+      self._diff_dict_writelock.acquire()
+      try:
+        self._diff_dict[key] = diff_record
+      finally:
+        self._diff_dict_writelock.release()
+
+  @property
+  def storage_root(self):
+    return self._storage_root
  
    def add_image_pair(self,
                       expected_image_url, expected_image_locator,
                       actual_image_url, actual_image_locator):
-    """Download this pair of images (unless we already have them on local disk),
-    and prepare a DiffRecord for them.
+    """Asynchronously prepare a DiffRecord for a pair of images.
+
+    This method will return quickly; calls to get_diff_record() will block
+    until the DiffRecord is available (or we have given up on creating it).
  
-    TODO(epoger): Make this asynchronously download images, rather than blocking
-    until the images have been downloaded and processed.
-    When we do that, we should probably add a new method that will block
-    until all of the images have been downloaded and processed.  Otherwise,
-    we won't know when it's safe to start calling get_diff_record().
-    jcgregorio notes: maybe just make ImageDiffDB thread-safe and create a
-    thread-pool/worker queue at a higher level that just uses ImageDiffDB?
+    If we already have a DiffRecord for this particular image pair, no work
+    will be done.
+
+    If expected_image_url (or its locator) is None, just download actual_image.
+    If actual_image_url (or its locator) is None, just download expected_image.
  
      Args:
-      expected_image_url: file or HTTP url from which we will download the
+      expected_image_url: file, GS, or HTTP url from which we will download the
            expected image
        expected_image_locator: a unique ID string under which we will store the
            expected image within storage_root (probably including a checksum to
            guarantee uniqueness)
-      actual_image_url: file or HTTP url from which we will download the
+      actual_image_url: file, GS, or HTTP url from which we will download the
            actual image
        actual_image_locator: a unique ID string under which we will store the
            actual image within storage_root (probably including a checksum to
@@ -247,133 +348,97 @@ class ImageDiffDB(object):
      expected_image_locator = _sanitize_locator(expected_image_locator)
      actual_image_locator = _sanitize_locator(actual_image_locator)
      key = (expected_image_locator, actual_image_locator)
-    if not key in self._diff_dict:
-      try:
-        new_diff_record = DiffRecord(
-            self._storage_root,
-            expected_image_url=expected_image_url,
-            expected_image_locator=expected_image_locator,
-            actual_image_url=actual_image_url,
-            actual_image_locator=actual_image_locator)
-      except Exception:
-        # If we can't create a real DiffRecord for this (expected, actual) pair,
-        # store None and the UI will show whatever information we DO have.
-        # Fixes http://skbug.com/2368 .
-        logging.exception(
-            'got exception while creating a DiffRecord for '
-            'expected_image_url=%s , actual_image_url=%s; returning None' % (
-                expected_image_url, actual_image_url))
-        new_diff_record = None
-      self._diff_dict[key] = new_diff_record
+    must_add_to_queue = False
+
+    self._diff_dict_writelock.acquire()
+    try:
+      if not key in self._diff_dict:
+        # If we have already requested a diff between these two images,
+        # we don't need to request it again.
+        must_add_to_queue = True
+        self._diff_dict[key] = _DIFFRECORD_PENDING
+    finally:
+      self._diff_dict_writelock.release()
+
+    if must_add_to_queue:
+      self._tasks_queue.put((key, expected_image_url, actual_image_url))
+      self.log_queue_size_if_changed()
  
    def get_diff_record(self, expected_image_locator, actual_image_locator):
      """Returns the DiffRecord for this image pair.
  
-    Raises a KeyError if we don't have a DiffRecord for this image pair.
+    This call will block until the diff record is available, or we were unable
+    to generate it.
+
+    Args:
+      expected_image_locator: a unique ID string under which we will store the
+          expected image within storage_root (probably including a checksum to
+          guarantee uniqueness)
+      actual_image_locator: a unique ID string under which we will store the
+          actual image within storage_root (probably including a checksum to
+          guarantee uniqueness)
+
+    Returns the DiffRecord for this image pair, or None if we were unable to
+    generate one.
      """
      key = (_sanitize_locator(expected_image_locator),
             _sanitize_locator(actual_image_locator))
-    return self._diff_dict[key]
-
+    diff_record = self._diff_dict[key]
  
-# Utility functions
-
-def _max_per_band(histogram):
-  """Given the histogram of an image, return the maximum value of each band
-  (a.k.a. "color channel", such as R/G/B) across the entire image.
-
-  Args:
-    histogram: PIL histogram
-
-  Returns the maximum value of each band within the image histogram, as a list.
-  """
-  max_per_band = []
-  assert(len(histogram) % VALUES_PER_BAND == 0)
-  num_bands = len(histogram) / VALUES_PER_BAND
-  for band in xrange(num_bands):
-    # Assuming that VALUES_PER_BAND is 256...
-    #  the 'R' band makes up indices 0-255 in the histogram,
-    #  the 'G' band makes up indices 256-511 in the histogram,
-    #  etc.
-    min_index = band * VALUES_PER_BAND
-    index = min_index + VALUES_PER_BAND
-    while index > min_index:
-      index -= 1
-      if histogram[index] > 0:
-        max_per_band.append(index - min_index)
-        break
-  return max_per_band
-
-
-def _generate_image_diff(image1, image2):
-  """Wrapper for ImageChops.difference(image1, image2) that will handle some
-  errors automatically, or at least yield more useful error messages.
-
-  TODO(epoger): Currently, some of the images generated by the bots are RGBA
-  and others are RGB.  I'm not sure why that is.  For now, to avoid confusion
-  within the UI, convert all to RGB when diffing.
+    # If we have no results yet, block until we do.
+    while diff_record == _DIFFRECORD_PENDING:
+      time.sleep(1)
+      diff_record = self._diff_dict[key]
  
-  Args:
-    image1: a PIL image object
-    image2: a PIL image object
+    # Once we have the result...
+    if diff_record == _DIFFRECORD_FAILED:
+      logging.error(
+          'failed to create a DiffRecord for expected_image_locator=%s , '
+          'actual_image_locator=%s' % (
+              expected_image_locator, actual_image_locator))
+      return None
+    else:
+      return diff_record
  
-  Returns: per-pixel diffs between image1 and image2, as a PIL image object
-  """
-  try:
-    return ImageChops.difference(image1.convert('RGB'), image2.convert('RGB'))
-  except ValueError:
-    logging.error('Error diffing image1 [%s] and image2 [%s].' % (
-        repr(image1), repr(image2)))
-    raise
  
+# Utility functions
  
-def _download_and_open_image(local_filepath, url):
-  """Open the image at local_filepath; if there is no file at that path,
-  download it from url to that path and then open it.
+def _download_file(gs, local_filepath, url):
+  """Download a file from url to local_filepath, unless it is already there.
  
    Args:
+    gs: instance of GSUtils object, in case the url points at Google Storage
      local_filepath: path on local disk where the image should be stored
-    url: URL from which we can download the image if we don't have it yet
-
-  Returns: a PIL image object
+    url: HTTP or GS URL from which we can download the image if we don't have
+        it yet
    """
+  global global_file_collisions
    if not os.path.exists(local_filepath):
      _mkdir_unless_exists(os.path.dirname(local_filepath))
-    with contextlib.closing(urllib.urlopen(url)) as url_handle:
-      with open(local_filepath, 'wb') as file_handle:
-        shutil.copyfileobj(fsrc=url_handle, fdst=file_handle)
-  return _open_image(local_filepath)
-
-
-def _open_image(filepath):
-  """Wrapper for Image.open(filepath) that yields more useful error messages.
-
-  Args:
-    filepath: path on local disk to load image from
-
-  Returns: a PIL image object
-  """
-  try:
-    return Image.open(filepath)
-  except IOError:
-    # If we are unable to load an image from the file, delete it from disk
-    # and we will try to fetch it again next time.  Fixes http://skbug.com/2247
-    logging.error('IOError loading image file %s ; deleting it.' % filepath)
-    os.remove(filepath)
-    raise
-
-
-def _save_image(image, filepath, format='PNG'):
-  """Write an image to disk, creating any intermediate directories as needed.
  
-  Args:
-    image: a PIL image object
-    filepath: path on local disk to write image to
-    format: one of the PIL image formats, listed at
-            http://effbot.org/imagingbook/formats.htm
-  """
-  _mkdir_unless_exists(os.path.dirname(filepath))
-  image.save(filepath, format)
+    # First download the file contents into a unique filename, and
+    # then rename that file.  That way, if multiple threads are downloading
+    # the same filename at the same time, they won't interfere with each
+    # other (they will both download the file, and one will "win" in the end)
+    temp_filename = '%s-%d' % (local_filepath,
+                               threading.current_thread().ident)
+    if gs_utils.GSUtils.is_gs_url(url):
+      (bucket, path) = gs_utils.GSUtils.split_gs_url(url)
+      gs.download_file(source_bucket=bucket, source_path=path,
+                       dest_path=temp_filename)
+    else:
+      with contextlib.closing(urllib.urlopen(url)) as url_handle:
+        with open(temp_filename, 'wb') as file_handle:
+          shutil.copyfileobj(fsrc=url_handle, fdst=file_handle)
+
+    # Rename the file to its real filename.
+    # Keep count of how many colliding downloads we encounter;
+    # if it's a large number, we may want to change our download strategy
+    # to minimize repeated downloads.
+    if os.path.exists(local_filepath):
+      global_file_collisions += 1
+    else:
+      os.rename(temp_filename, local_filepath)
  
  
  def _mkdir_unless_exists(path):
@@ -382,8 +447,11 @@ def _mkdir_unless_exists(path):
    Args:
      path: path on local disk
    """
-  if not os.path.isdir(path):
+  try:
      os.makedirs(path)
+  except OSError as e:
+    if e.errno == errno.EEXIST:
+      pass
  
  
  def _sanitize_locator(locator):
@@ -391,24 +459,11 @@ def _sanitize_locator(locator):
    characters will have special meaning in filenames).
  
    Args:
-    locator: string, or something that can be represented as a string
-  """
-  return DISALLOWED_FILEPATH_CHAR_REGEX.sub('_', str(locator))
-
-
-def _get_difference_locator(expected_image_locator, actual_image_locator):
-  """Returns the locator string used to look up the diffs between expected_image
-  and actual_image.
-
-  We must keep this function in sync with getImageDiffRelativeUrl() in
-  static/loader.js
-
-  Args:
-    expected_image_locator: locator string pointing at expected image
-    actual_image_locator: locator string pointing at actual image
-
-  Returns: already-sanitized locator where the diffs between expected and
-      actual images can be found
+    locator: string, or something that can be represented as a string.
+        If None or '', it is returned without modification, because empty
+        locators have a particular meaning ("there is no image for this")
    """
-  return "%s-vs-%s" % (_sanitize_locator(expected_image_locator),
-                       _sanitize_locator(actual_image_locator))
+  if locator:
+    return DISALLOWED_FILEPATH_CHAR_REGEX.sub('_', str(locator))
+  else:
+    return locator