From 2dd402dc8375606a4db06256e9b194b4dc1aedf1 Mon Sep 17 00:00:00 2001 From: Bruce Forstall Date: Mon, 7 Jun 2021 15:00:52 -0700 Subject: [PATCH] Fix superpmi.py/jitrollingbuild.py to handle large REST results (#53750) The Azure Storage REST API returns a maximum of 5000 results in a single query. We have more results in the JIT rolling build set, so we weren't finding queried git_hash values. Generalize the query to loop using the "marker" continuation functionality. We currently have only about 800 SuperPMI collection results, but the superpmi.py script is updated similarly for future-proofing. I also changed the query used by jitrollingbuild.py to specify a prefix, to avoid requiring so many results. We should probably do the same thing for SuperPMI collections. The downside is that the REST API doesn't have a facility for the prefix to be case-insensitive. I think it's OK to be case-sensitive, but we'd need to verify that. --- src/coreclr/scripts/jitrollingbuild.py | 105 +++++++++++++++++++++------------ src/coreclr/scripts/superpmi.py | 94 ++++++++++++++++++----------- 2 files changed, 125 insertions(+), 74 deletions(-) diff --git a/src/coreclr/scripts/jitrollingbuild.py b/src/coreclr/scripts/jitrollingbuild.py index 8343443..c5bf6d0 100644 --- a/src/coreclr/scripts/jitrollingbuild.py +++ b/src/coreclr/scripts/jitrollingbuild.py @@ -172,11 +172,12 @@ def determine_jit_name(coreclr_args): raise RuntimeError("Unknown OS.") -def list_az_jits(filter_func=lambda unused: True): +def list_az_jits(filter_func=lambda unused: True, prefix_string = None): """ List the JITs in Azure Storage using REST api Args: filter_func (lambda: string -> bool): filter to apply to the list. The filter takes a URL and returns True if this URL is acceptable. + prefix_string: Optional. Specifies a string prefix for the Azure Storage query. Returns: urls (list): set of URLs in Azure Storage that match the filter. 
@@ -187,46 +188,72 @@ def list_az_jits(filter_func=lambda unused: True): # This URI will return *all* the blobs, for all git_hash/OS/architecture/build_type combinations. # pass "prefix=foo/bar/..." to only show a subset. Or, we can filter later using string search. - list_az_container_uri = az_blob_storage_container_uri + "?restype=container&comp=list&prefix=" + az_builds_root_folder + "/" - - try: - contents = urllib.request.urlopen(list_az_container_uri).read().decode('utf-8') - except Exception as exception: - print("Didn't find any collections using {}".format(list_az_container_uri)) - print(" Error: {}".format(exception)) - return None - - # Contents is an XML file with contents like: - # - # builds/ - # - # - # builds/755f01659f03196487ec41225de8956911f8049b/Linux/x64/Checked/libclrjit.so - # https://clrjit2.blob.core.windows.net/jitrollingbuild/builds/755f01659f03196487ec41225de8956911f8049b/Linux/x64/Checked/libclrjit.so - # - # ... - # - # - # - # builds/755f01659f03196487ec41225de8956911f8049b/OSX/x64/Checked/libclrjit.dylib - # https://clrjit2.blob.core.windows.net/jitrollingbuild/builds/755f01659f03196487ec41225de8956911f8049b/OSX/x64/Checked/libclrjit.dylib - # - # ... - # - # - # ... etc. ... - # - # # - # We just want to extract the entries. We could probably use an XML parsing package, but we just - # use regular expressions. + # Note that there is a maximum number of results returned in one query of 5000. So we might need to + # iterate. In that case, the XML result contains a `<NextMarker>` element like: + # + # <NextMarker>2!184!MDAwMDkyIWJ1aWxkcy8wMTZlYzI5OTAzMzkwMmY2ZTY4Yzg0YWMwYTNlYzkxN2Y5MzA0OTQ2L0xpbnV4L3g2NC9DaGVja2VkL2xpYmNscmppdF93aW5fYXJtNjRfeDY0LnNvITAwMDAyOCE5OTk5LTEyLTMxVDIzOjU5OjU5Ljk5OTk5OTlaIQ--</NextMarker> + # + # which we need to pass to the REST API with `marker=...`. 
- urls_split = contents.split("")[1:] urls = [] - for item in urls_split: - url = item.split("")[0].strip() - if filter_func(url): - urls.append(url) + + list_az_container_uri_root = az_blob_storage_container_uri + "?restype=container&comp=list&prefix=" + az_builds_root_folder + "/" + if prefix_string: + list_az_container_uri_root += prefix_string + + iter = 1 + marker = "" + + while True: + list_az_container_uri = list_az_container_uri_root + marker + + try: + contents = urllib.request.urlopen(list_az_container_uri).read().decode('utf-8') + except Exception as exception: + print("Didn't find any collections using {}".format(list_az_container_uri)) + print(" Error: {}".format(exception)) + return None + + # Contents is an XML file with contents like: + # + # builds/ + # + # + # builds/755f01659f03196487ec41225de8956911f8049b/Linux/x64/Checked/libclrjit.so + # https://clrjit2.blob.core.windows.net/jitrollingbuild/builds/755f01659f03196487ec41225de8956911f8049b/Linux/x64/Checked/libclrjit.so + # + # ... + # + # + # + # builds/755f01659f03196487ec41225de8956911f8049b/OSX/x64/Checked/libclrjit.dylib + # https://clrjit2.blob.core.windows.net/jitrollingbuild/builds/755f01659f03196487ec41225de8956911f8049b/OSX/x64/Checked/libclrjit.dylib + # + # ... + # + # + # ... etc. ... + # + # + # + # We just want to extract the entries. We could probably use an XML parsing package, but we just + # use regular expressions. + + urls_split = contents.split("")[1:] + for item in urls_split: + url = item.split("")[0].strip() + if filter_func(url): + urls.append(url) + + # Look for a continuation marker. 
+ re_match = re.match(r'.*(.*).*', contents) + if re_match: + marker_text = re_match.group(1) + marker = "&marker=" + marker_text + iter += 1 + else: + break return urls @@ -449,7 +476,7 @@ def get_jit_urls(coreclr_args, find_all=False): url = url.lower() return find_all or url.startswith(blob_prefix_filter) - return list_az_jits(filter_jits) + return list_az_jits(filter_jits, None if find_all else blob_filter_string) def download_command(coreclr_args): diff --git a/src/coreclr/scripts/superpmi.py b/src/coreclr/scripts/superpmi.py index 37f069d..449c8d2 100755 --- a/src/coreclr/scripts/superpmi.py +++ b/src/coreclr/scripts/superpmi.py @@ -2407,45 +2407,69 @@ def list_superpmi_collections_container_via_rest_api(path_filter=lambda unused: # This URI will return *all* the blobs, for all jit-ee-version/OS/architecture combinations. # pass "prefix=foo/bar/..." to only show a subset. Or, we can filter later using string search. - list_superpmi_container_uri = az_blob_storage_superpmi_container_uri + "?restype=container&comp=list&prefix=" + az_collections_root_folder + "/" - - try: - contents = urllib.request.urlopen(list_superpmi_container_uri).read().decode('utf-8') - except Exception as exception: - logging.error("Didn't find any collections using %s", list_superpmi_container_uri) - logging.error(" Error: %s", exception) - return None - - # Contents is an XML file with contents like: # - # - # - # - # jit-ee-guid/Linux/x64/Linux.x64.Checked.frameworks.mch.zip - # https://clrjit.blob.core.windows.net/superpmi/collections/jit-ee-guid/Linux/x64/Linux.x64.Checked.frameworks.mch.zip - # - # ... - # - # - # - # jit-ee-guid/Linux/x64/Linux.x64.Checked.mch.zip - # https://clrjit.blob.core.windows.net/superpmi/collections/jit-ee-guid/Linux/x64/Linux.x64.Checked.mch.zip - # ... etc. ... - # - # + # Note that there is a maximum number of results returned in one query of 5000. So we might need to + # iterate. 
In that case, the XML result contains a `<NextMarker>` element like: # - # We just want to extract the entries. We could probably use an XML parsing package, but we just - # use regular expressions. - - url_prefix = az_blob_storage_superpmi_container_uri + "/" + az_collections_root_folder + "/" + # <NextMarker>2!184!MDAwMDkyIWJ1aWxkcy8wMTZlYzI5OTAzMzkwMmY2ZTY4Yzg0YWMwYTNlYzkxN2Y5MzA0OTQ2L0xpbnV4L3g2NC9DaGVja2VkL2xpYmNscmppdF93aW5fYXJtNjRfeDY0LnNvITAwMDAyOCE5OTk5LTEyLTMxVDIzOjU5OjU5Ljk5OTk5OTlaIQ--</NextMarker> + # + # which we need to pass to the REST API with `marker=...`. - urls_split = contents.split("")[1:] paths = [] - for item in urls_split: - url = item.split("")[0].strip() - path = remove_prefix(url, url_prefix) - if path_filter(path): - paths.append(path) + + list_superpmi_container_uri_base = az_blob_storage_superpmi_container_uri + "?restype=container&comp=list&prefix=" + az_collections_root_folder + "/" + + iter = 1 + marker = "" + + while True: + list_superpmi_container_uri = list_superpmi_container_uri_base + marker + + try: + contents = urllib.request.urlopen(list_superpmi_container_uri).read().decode('utf-8') + except Exception as exception: + logging.error("Didn't find any collections using %s", list_superpmi_container_uri) + logging.error(" Error: %s", exception) + return None + + # Contents is an XML file with contents like: + # + # + # + # + # jit-ee-guid/Linux/x64/Linux.x64.Checked.frameworks.mch.zip + # https://clrjit.blob.core.windows.net/superpmi/collections/jit-ee-guid/Linux/x64/Linux.x64.Checked.frameworks.mch.zip + # + # ... + # + # + # + # jit-ee-guid/Linux/x64/Linux.x64.Checked.mch.zip + # https://clrjit.blob.core.windows.net/superpmi/collections/jit-ee-guid/Linux/x64/Linux.x64.Checked.mch.zip + # ... etc. ... + # + # + # + # We just want to extract the entries. We could probably use an XML parsing package, but we just + # use regular expressions. 
+ + url_prefix = az_blob_storage_superpmi_container_uri + "/" + az_collections_root_folder + "/" + + urls_split = contents.split("")[1:] + for item in urls_split: + url = item.split("")[0].strip() + path = remove_prefix(url, url_prefix) + if path_filter(path): + paths.append(path) + + # Look for a continuation marker. + re_match = re.match(r'.*(.*).*', contents) + if re_match: + marker_text = re_match.group(1) + marker = "&marker=" + marker_text + iter += 1 + else: + break return paths -- 2.7.4