Replace urllib2 with requests in crawler.py 98/95298/6
author Aleksander Mistewicz <a.mistewicz@samsung.com>
Wed, 19 Oct 2016 14:52:39 +0000 (16:52 +0200)
committer Aleksander Mistewicz <a.mistewicz@samsung.com>
Tue, 10 Jan 2017 09:49:05 +0000 (10:49 +0100)
requests uses urllib3 connection pooling, which dramatically increases
performance. Link discovery was also changed to follow the current
Beautiful Soup guide.

Package "python-requests" is now required.

Change-Id: Ide3e3c9dd290c2ce01137c00fa2f01cd29e35712
Signed-off-by: Aleksander Mistewicz <a.mistewicz@samsung.com>
tsp/scripts/crawler.py

index 59436c8..e2e7d81 100755
@@ -19,7 +19,7 @@
 # @author Pawel Wieczorek <p.wieczorek2@samsung.com>
 
 import os
-import urllib2
+import requests
 import time
 import argparse
 import logging
@@ -35,25 +35,20 @@ def crawl(url):
 
     visited.add(url)
 
-    h = urllib2.build_opener()
-    h.addheaders = [('User-agent', 'Prerelease Crawler')]
+    headers = {'User-agent': 'Prerelease Crawler'}
+    r = requests.get(url, headers=headers)
 
-    try:
-        resp = h.open(url)
-    except urllib2.HTTPError as e:
-        print 'Failed to access {url}: {code} - {reason}'\
-            .format(url=url, code=e.code, reason=e.reason)
-
-    html = str(resp.read())
-    soup = bs4.BeautifulSoup(html, 'lxml')
-    links = soup('a')
+    links = set()
+    soup = bs4.BeautifulSoup(r.text, 'html.parser')
+    for link in soup.find_all('a'):
+        links.add(link.get('href'))
 
     discovered = set()
     for link in links:
         if link not in discovered and link not in visited:
-            if link.string.startswith('tizen-common'):
-                logging.debug("Add link to discovered: %s", link['href'])
-                discovered.add(url + link['href'])
+            if link.startswith('tizen-common'):
+                logging.debug("Add link to discovered: %s", link)
+                discovered.add(url + link)
 
     return discovered
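
One caveat in the new discovery loop: anchors without an href make
link.get('href') return None, and link.startswith(...) would then raise
a TypeError. A minimal hardening sketch, using Beautiful Soup's
href=True filter (not part of this patch):

    links = set()
    for link in soup.find_all('a', href=True):  # skip anchors lacking href
        links.add(link['href'])                 # guaranteed to be a string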
 
@@ -68,6 +63,8 @@ def get_modified_paths(discovered, timestamp):
     else:
         return discovered
     logging.info("Previous timestamp: %s", stamp)
+    s = requests.Session()
+    s.headers.update({"If-Modified-Since": stamp})
     for url in discovered:
         logging.debug("Check for MD5SUMS change: %s", url)
         md5sums_urls = [url + 'images/arm-wayland/common-wayland-3parts-armv7l-odroidu3/MD5SUMS',\
@@ -78,15 +75,17 @@ def get_modified_paths(discovered, timestamp):
             url + 'images/arm-wayland/common-headless-3parts-armv7l-artik/MD5SUMS']
         change = False
         for md5sums_url in md5sums_urls:
-            try:
-                u = urllib2.urlopen(urllib2.Request(md5sums_url, headers={"If-Modified-Since": stamp}))
-            except urllib2.HTTPError as e:
-                if e.code == 404:
-                    logging.debug("MD5SUMS missing: %s", md5sums_url)
-                    break
-            else:
+            r = s.get(md5sums_url)
+            if r.status_code == requests.codes.ok:
                 logging.debug("MD5SUMS changed: %s", md5sums_url)
                 change = True
+            elif r.status_code == 404:
+                logging.debug("MD5SUMS missing: %s", md5sums_url)
+                break
+            elif r.status_code == 304:
+                logging.debug("MD5SUMS unchanged: %s", md5sums_url)
+            else:
+                logging.warn("MD5SUMS error: %s", md5sums_url)
         if change:
             logging.info("Add to dispatch: %s", url)
             ret.add(url)
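
The conditional-GET pattern above can be reused anywhere a file should
only be fetched when it has changed: send If-Modified-Since on the
Session and branch on the status code. A short sketch with hypothetical
names (md5sums_url, dispatch):

    import requests

    s = requests.Session()
    # HTTP-date string, e.g. read from the previous crawl's timestamp file
    s.headers.update({'If-Modified-Since': 'Wed, 19 Oct 2016 14:52:39 GMT'})

    r = s.get(md5sums_url)                              # hypothetical URL
    if r.status_code == requests.codes.ok:              # 200: changed
        dispatch(md5sums_url)                           # hypothetical handler
    elif r.status_code == requests.codes.not_modified:  # 304: unchanged
        pass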