Replace urllib2 with requests in crawler.py 98/95298/6
author Aleksander Mistewicz <a.mistewicz@samsung.com>
Wed, 19 Oct 2016 14:52:39 +0000 (16:52 +0200)
committer Aleksander Mistewicz <a.mistewicz@samsung.com>
Tue, 10 Jan 2017 09:49:05 +0000 (10:49 +0100)
requests uses urllib3 connection pooling, which dramatically increases
performance. Link discovery was also changed to follow the current
Beautiful Soup guide.

Package "python-requests" is now required.

Change-Id: Ide3e3c9dd290c2ce01137c00fa2f01cd29e35712
Signed-off-by: Aleksander Mistewicz <a.mistewicz@samsung.com>
tsp/scripts/crawler.py

index 59436c8..e2e7d81 100755
@@ -19,7 +19,7 @@
 # @author Pawel Wieczorek <p.wieczorek2@samsung.com>
 
 import os
-import urllib2
+import requests
 import time
 import argparse
 import logging
@@ -35,25 +35,20 @@ def crawl(url):
 
     visited.add(url)
 
-    h = urllib2.build_opener()
-    h.addheaders = [('User-agent', 'Prerelease Crawler')]
+    headers = {'User-agent': 'Prerelease Crawler'}
+    r = requests.get(url, headers=headers)
 
-    try:
-        resp = h.open(url)
-    except urllib2.HTTPError as e:
-        print 'Failed to access {url}: {code} - {reason}'\
-            .format(url=url, code=e.code, reason=e.reason)
-
-    html = str(resp.read())
-    soup = bs4.BeautifulSoup(html, 'lxml')
-    links = soup('a')
+    links = set()
+    soup = bs4.BeautifulSoup(r.text, 'html.parser')
+    for link in soup.find_all('a'):
+        links.add(link.get('href'))
 
     discovered = set()
     for link in links:
         if link not in discovered and link not in visited:
-            if link.string.startswith('tizen-common'):
-                logging.debug("Add link to discovered: %s", link['href'])
-                discovered.add(url + link['href'])
+            if link.startswith('tizen-common'):
+                logging.debug("Add link to discovered: %s", link)
+                discovered.add(url + link)
 
     return discovered
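
One caveat in the new discovery loop: anchors without an href make
link.get('href') return None, and link.startswith(...) would then raise
a TypeError. A minimal hardening sketch, using Beautiful Soup's
href=True filter (not part of this patch):

    links = set()
    for link in soup.find_all('a', href=True):  # skip anchors lacking href
        links.add(link['href'])                 # guaranteed to be a string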
 
@@ -68,6 +63,8 @@ def get_modified_paths(discovered, timestamp):
     else:
         return discovered
     logging.info("Previous timestamp: %s", stamp)
+    s = requests.Session()
+    s.headers.update({"If-Modified-Since": stamp})
     for url in discovered:
         logging.debug("Check for MD5SUMS change: %s", url)
         md5sums_urls = [url + 'images/arm-wayland/common-wayland-3parts-armv7l-odroidu3/MD5SUMS',\
@@ -78,15 +75,17 @@ def get_modified_paths(discovered, timestamp):
             url + 'images/arm-wayland/common-headless-3parts-armv7l-artik/MD5SUMS']
         change = False
         for md5sums_url in md5sums_urls:
-            try:
-                u = urllib2.urlopen(urllib2.Request(md5sums_url, headers={"If-Modified-Since": stamp}))
-            except urllib2.HTTPError as e:
-                if e.code == 404:
-                    logging.debug("MD5SUMS missing: %s", md5sums_url)
-                    break
-            else:
+            r = s.get(md5sums_url)
+            if r.status_code == requests.codes.ok:
                 logging.debug("MD5SUMS changed: %s", md5sums_url)
                 change = True
+            elif r.status_code == 404:
+                logging.debug("MD5SUMS missing: %s", md5sums_url)
+                break
+            elif r.status_code == 304:
+                logging.debug("MD5SUMS unchanged: %s", md5sums_url)
+            else:
+                logging.warn("MD5SUMS error: %s", md5sums_url)
         if change:
             logging.info("Add to dispatch: %s", url)
             ret.add(url)
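
The conditional-GET pattern above can be reused anywhere a file should
only be fetched when it has changed: send If-Modified-Since on the
Session and branch on the status code. A short sketch with hypothetical
names (md5sums_url, dispatch):

    import requests

    s = requests.Session()
    # HTTP-date string, e.g. read from the previous crawl's timestamp file
    s.headers.update({'If-Modified-Since': 'Wed, 19 Oct 2016 14:52:39 GMT'})

    r = s.get(md5sums_url)                              # hypothetical URL
    if r.status_code == requests.codes.ok:              # 200: changed
        dispatch(md5sums_url)                           # hypothetical handler
    elif r.status_code == requests.codes.not_modified:  # 304: unchanged
        pass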