From 5211c9869327081c197a8d352cc79fd83d6093d9 Mon Sep 17 00:00:00 2001
From: Aleksander Mistewicz
Date: Wed, 19 Oct 2016 16:52:39 +0200
Subject: [PATCH] Replace urllib2 with requests in crawler.py

Requests uses urllib3 connection pooling, which dramatically increases
performance. Link discovery was also changed to follow the latest
guide.

Package "python-requests" is now required.

Change-Id: Ide3e3c9dd290c2ce01137c00fa2f01cd29e35712
Signed-off-by: Aleksander Mistewicz
---
 tsp/scripts/crawler.py | 43 +++++++++++++++++++++----------------------
 1 file changed, 21 insertions(+), 22 deletions(-)

diff --git a/tsp/scripts/crawler.py b/tsp/scripts/crawler.py
index 59436c8..e2e7d81 100755
--- a/tsp/scripts/crawler.py
+++ b/tsp/scripts/crawler.py
@@ -19,7 +19,7 @@
 # @author Pawel Wieczorek
 
 import os
-import urllib2
+import requests
 import time
 import argparse
 import logging
@@ -35,25 +35,20 @@ def crawl(url):
 
     visited.add(url)
 
-    h = urllib2.build_opener()
-    h.addheaders = [('User-agent', 'Prerelease Crawler')]
+    headers = {'User-agent': 'Prerelease Crawler'}
+    r = requests.get(url, headers=headers)
 
-    try:
-        resp = h.open(url)
-    except urllib2.HTTPError as e:
-        print 'Failed to access {url}: {code} - {reason}'\
-            .format(url=url, code=e.code, reason=e.reason)
-
-    html = str(resp.read())
-    soup = bs4.BeautifulSoup(html, 'lxml')
-    links = soup('a')
+    links = set()
+    soup = bs4.BeautifulSoup(r.text, 'html.parser')
+    for link in soup.find_all('a'):
+        links.add(link.get('href'))
 
     discovered = set()
     for link in links:
         if link not in discovered and link not in visited:
-            if link.string.startswith('tizen-common'):
-                logging.debug("Add link to discovered: %s", link['href'])
-                discovered.add(url + link['href'])
+            if link.startswith('tizen-common'):
+                logging.debug("Add link to discovered: %s", link)
+                discovered.add(url + link)
 
     return discovered
 
@@ -68,6 +63,8 @@ def get_modified_paths(discovered, timestamp):
         else:
             return discovered
     logging.info("Previous timestamp: %s", stamp)
+    s = requests.Session()
+    s.headers.update({"If-Modified-Since": stamp})
     for url in discovered:
         logging.debug("Check for MD5SUMS change: %s", url)
         md5sums_urls = [url + 'images/arm-wayland/common-wayland-3parts-armv7l-odroidu3/MD5SUMS',\
@@ -78,15 +75,17 @@ def get_modified_paths(discovered, timestamp):
             url + 'images/arm-wayland/common-headless-3parts-armv7l-artik/MD5SUMS']
         change = False
         for md5sums_url in md5sums_urls:
-            try:
-                u = urllib2.urlopen(urllib2.Request(md5sums_url, headers={"If-Modified-Since": stamp}))
-            except urllib2.HTTPError as e:
-                if e.code == 404:
-                    logging.debug("MD5SUMS missing: %s", md5sums_url)
-                    break
-            else:
+            r = s.get(md5sums_url)
+            if r.status_code == requests.codes.ok:
                 logging.debug("MD5SUMS changed: %s", md5sums_url)
                 change = True
+            elif r.status_code == 404:
+                logging.debug("MD5SUMS missing: %s", md5sums_url)
+                break
+            elif r.status_code == 304:
+                logging.debug("MD5SUMS unchanged: %s", md5sums_url)
+            else:
+                logging.warn("MD5SUMS error: %s", md5sums_url)
         if change:
             logging.info("Add to dispatch: %s", url)
             ret.add(url)
-- 
2.7.4
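
Note on the performance claim: the pooling benefit comes from the
requests.Session added in get_modified_paths, which draws connections
from urllib3's pool and reuses one TCP connection across requests to
the same host; a bare requests.get (as in crawl) opens a fresh
connection on every call. A minimal sketch of the pattern, assuming an
illustrative host and paths that are not taken from the patch:

    import requests

    # One Session: consecutive GETs to the same host reuse a pooled
    # TCP connection instead of doing a new handshake per request.
    s = requests.Session()
    s.headers.update({'User-agent': 'Prerelease Crawler'})

    # example.org and the paths are placeholders for illustration.
    for path in ('a/MD5SUMS', 'b/MD5SUMS', 'c/MD5SUMS'):
        r = s.get('http://example.org/prerelease/' + path)
        print(r.status_code)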
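
The status-code branches in get_modified_paths implement a conditional
GET: with an If-Modified-Since header the server answers 200 only when
the resource changed after the given date, 304 when it did not, and 404
when it is absent. A standalone sketch of that handshake, with an
illustrative URL and date that are not taken from the patch:

    import requests

    s = requests.Session()
    # RFC 1123 date; the server compares it to the resource's mtime.
    s.headers.update({'If-Modified-Since': 'Wed, 19 Oct 2016 14:00:00 GMT'})

    r = s.get('http://example.org/MD5SUMS')  # illustrative URL
    if r.status_code == requests.codes.ok:
        print('changed since the given date')
    elif r.status_code == 304:
        print('unchanged')
    elif r.status_code == 404:
        print('missing')
    else:
        print('unexpected status: %d' % r.status_code)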