# tsp/scripts/crawler.py — prerelease crawler.
# Image directories (where MD5SUMS files live) are discovered by walking
# the directory listings instead of being hard-coded per architecture.

# State files: URLs discovered on the last run / URLs already dispatched.
discovered_urls = 'modified_urls'
dispatched_urls = 'dispatched_urls'

def get_links(session, url):
    """Fetch *url* and return the set of href targets of its anchors.

    :param session: a requests.Session (carries headers such as
        ``If-Modified-Since`` / ``User-agent`` set by the caller)
    :param url: page to fetch; expected to be an HTML directory listing
    :returns: set of href strings
    :raises requests.RequestException: on network failure/timeout

    Anchors without an ``href`` attribute yield ``None`` from
    ``link.get('href')``; those are skipped here, otherwise callers
    crash with AttributeError on ``None.startswith(...)``.
    """
    response = session.get(url, timeout=120)
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    return {a.get('href') for a in soup.find_all('a')
            if a.get('href') is not None}

def crawl(url):
    """Return the set of absolute prerelease directory URLs under *url*.

    Only relative links whose name starts with ``tizen-`` are kept
    (covers tizen-common, tizen-mobile, ... — the patch generalizes the
    old ``tizen-common``-only filter). Each kept link is joined onto
    *url*, so *url* must end with ``/``.
    """
    logging.info("crawl: %s", url)

    session = requests.Session()
    session.headers.update({'User-agent': 'Prerelease Crawler'})

    discovered = set()
    for link in get_links(session, url):
        # get_links returns a set, so each href is seen at most once;
        # no separate visited/duplicate bookkeeping is needed.
        if link.startswith('tizen-'):
            logging.debug("Add link to discovered: %s", link)
            discovered.add(url + link)
    return discovered

def get_urls2check(session, md5sums, url, level=2):
    """Recursively collect MD5SUMS URLs from directory listings.

    Walks relative sub-directory links under *url* up to *level* levels
    deep. At depth 0 — an image directory in the expected
    ``images/<repo>/<image>/`` layout — the URL of its MD5SUMS file is
    recorded. Mutates *md5sums* (a set) in place; returns None.

    :param session: requests.Session used for all fetches
    :param md5sums: set accumulating ``.../MD5SUMS`` URLs (output param)
    :param url: listing to scan; must end with ``/``
    :param level: remaining recursion depth (default 2 matches the
        two-level ``images/<repo>/<image>/`` hierarchy)
    """
    links = get_links(session, url)
    for link in links:
        # Relative links ending in "/" are sub-directories; absolute
        # ("/...") links point outside this listing (parent dir, sort
        # headers) and are ignored.
        if not link.startswith("/") and link.endswith("/"):
            if level > 0:
                logging.debug("Step into: %s", link)
                get_urls2check(session, md5sums, url + link, level - 1)
            else:
                logging.debug("Reached depth limit, ignore: %s", link)
    if level == 0:
        # We are inside an image directory: this is where an MD5SUMS
        # file should exist. If it is missing, the later freshness check
        # simply gets a 404 and the prerelease is not dispatched.
        md5sums.add(url + "MD5SUMS")