Make crawler universal 20/108720/3
author Aleksander Mistewicz <a.mistewicz@samsung.com>
Fri, 4 Nov 2016 10:27:55 +0000 (11:27 +0100)
committer Aleksander Mistewicz <a.mistewicz@samsung.com>
Mon, 16 Jan 2017 10:07:48 +0000 (11:07 +0100)
Directories where MD5SUMS files should be present are now detected by crawling.
If an MD5SUMS file is missing, the prerelease will not be dispatched.
If the image directory is missing, the prerelease will be dispatched.

Change-Id: I98088af7073ba9a8708b084f6974cbea18d000cd
Signed-off-by: Aleksander Mistewicz <a.mistewicz@samsung.com>
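
For context, a condensed, self-contained sketch of the discovery flow this change introduces; the snapshot URL is illustrative only, and the guard against anchors without an href is an added defensive assumption. The actual implementation is in tsp/scripts/crawler.py, shown in the diff below.

import bs4
import requests

def get_links(session, url):
    # Collect the href target of every anchor on a directory index page.
    page = session.get(url, timeout=120)
    soup = bs4.BeautifulSoup(page.text, 'html.parser')
    return {a.get('href') for a in soup.find_all('a')}

def get_urls2check(session, md5sums, url, level=2):
    # Walk relative subdirectory links; image directories are expected
    # two levels below images/ (<arch>/<image>/).
    for link in get_links(session, url):
        if link and not link.startswith("/") and link.endswith("/") and level > 0:
            get_urls2check(session, md5sums, url + link, level - 1)
    if level == 0:
        # Each image directory should publish an MD5SUMS file.
        md5sums.add(url + "MD5SUMS")

session = requests.Session()
session.headers.update({'User-agent': 'Prerelease Crawler'})
md5sums_urls = set()
# Illustrative snapshot URL; any prerelease with an images/ tree works.
get_urls2check(session, md5sums_urls,
               "http://download.tizen.org/prerelease/tizen/common/tizen-common_20161104.1/images/")
print(md5sums_urls)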
tsp/scripts/crawler.py

index e2e7d81..f2cc309 100755 (executable)
@@ -29,29 +29,48 @@ import bs4
 discovered_urls = 'modified_urls'
 dispatched_urls = 'dispatched_urls'
 
+def get_links(session, url):
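+    """Return the set of href targets of all anchors at url."""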
+    main = session.get(url, timeout=120)
+    soup = bs4.BeautifulSoup(main.text, 'html.parser')
+    links = set()
+    for link in soup.find_all('a'):
+        links.add(link.get('href'))
+    return links
+
 def crawl(url):
     logging.info("crawl: %s", url)
     visited = set()
 
     visited.add(url)
 
-    headers = {'User-agent': 'Prerelease Crawler'}
-    r = requests.get(url, headers=headers)
-
-    links = set()
-    soup = bs4.BeautifulSoup(r.text, 'html.parser')
-    for link in soup.find_all('a'):
-        links.add(link.get('href'))
+    s = requests.Session()
+    s.headers.update({'User-agent': 'Prerelease Crawler'})
+    links = get_links(s, url)
 
     discovered = set()
     for link in links:
         if link not in discovered and link not in visited:
-            if link.startswith('tizen-common'):
+            if link.startswith('tizen-'):
                 logging.debug("Add link to discovered: %s", link)
                 discovered.add(url + link)
 
     return discovered
 
+def get_urls2check(session, md5sums, url, level=2):
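+    """Recursively walk subdirectory links under url; at the depth limit,
+    add the expected MD5SUMS URL for each image directory to md5sums."""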
+    links = get_links(session, url)
+    for link in links:
+        if not link.startswith("/") and link.endswith("/"):
+            if level > 0:
+                logging.debug("Step into: %s", link)
+                get_urls2check(session, md5sums, url + link, level - 1)
+            else:
+                logging.debug("Reached depth limit, ignore: %s", link)
+    if level == 0:
+        md5sums.add(url + "MD5SUMS")
+
 def get_modified_paths(discovered, timestamp):
     logging.info("get_modified_paths")
     ret = set()
@@ -67,12 +86,8 @@ def get_modified_paths(discovered, timestamp):
     s.headers.update({"If-Modified-Since": stamp})
     for url in discovered:
         logging.debug("Check for MD5SUMS change: %s", url)
-        md5sums_urls = [url + 'images/arm-wayland/common-wayland-3parts-armv7l-odroidu3/MD5SUMS',\
-            url + 'images/x86_64-wayland/common-wayland-efi-x86_64/MD5SUMS',\
-            url + 'images/ia32-wayland/common-wayland-efi-i586/MD5SUMS',\
-            url + 'images/x86_64-wayland/common-minimal-mbr-x86_64/MD5SUMS',\
-            url + 'images/arm-wayland/common-wayland-3parts-armv7l-artik/MD5SUMS',\
-            url + 'images/arm-wayland/common-headless-3parts-armv7l-artik/MD5SUMS']
+        md5sums_urls = set()
+        get_urls2check(s, md5sums_urls, url + "images/")
         change = False
         for md5sums_url in md5sums_urls:
             r = s.get(md5sums_url)