Fix superpmi.py/jitrollingbuild.py to handle large REST results (#53750)
author Bruce Forstall <brucefo@microsoft.com>
Mon, 7 Jun 2021 22:00:52 +0000 (15:00 -0700)
committer GitHub <noreply@github.com>
Mon, 7 Jun 2021 22:00:52 +0000 (15:00 -0700)
The Azure Storage REST API returns a maximum of 5000 results in
a single query. We have more results than that in the JIT rolling build
set, so we weren't finding some queried git_hash values.

Generalize the query to loop using the "marker" continuation functionality.

We currently have only about 800 SuperPMI collection results, but update
the superpmi.py script similarly for future-proofing.

I also changed the query used by jitrollingbuild.py to specify a prefix,
to avoid fetching so many results. We should probably do the same thing for
SuperPMI collections. The downside is that the REST API has no facility
for making the prefix match case-insensitive. I think it's OK to be
case-sensitive, but we'd need to verify that.

src/coreclr/scripts/jitrollingbuild.py
src/coreclr/scripts/superpmi.py

index 8343443..c5bf6d0 100644 (file)
@@ -172,11 +172,12 @@ def determine_jit_name(coreclr_args):
         raise RuntimeError("Unknown OS.")
 
 
-def list_az_jits(filter_func=lambda unused: True):
+def list_az_jits(filter_func=lambda unused: True, prefix_string = None):
     """ List the JITs in Azure Storage using REST api
 
     Args:
         filter_func (lambda: string -> bool): filter to apply to the list. The filter takes a URL and returns True if this URL is acceptable.
+        prefix_string: Optional. Specifies a string prefix for the Azure Storage query.
 
     Returns:
         urls (list): set of URLs in Azure Storage that match the filter.
@@ -187,46 +188,72 @@ def list_az_jits(filter_func=lambda unused: True):
 
     # This URI will return *all* the blobs, for all git_hash/OS/architecture/build_type combinations.
     # pass "prefix=foo/bar/..." to only show a subset. Or, we can filter later using string search.
-    list_az_container_uri = az_blob_storage_container_uri + "?restype=container&comp=list&prefix=" + az_builds_root_folder + "/"
-
-    try:
-        contents = urllib.request.urlopen(list_az_container_uri).read().decode('utf-8')
-    except Exception as exception:
-        print("Didn't find any collections using {}".format(list_az_container_uri))
-        print("  Error: {}".format(exception))
-        return None
-
-    # Contents is an XML file with contents like:
-    # <EnumerationResults ContainerName="https://clrjit2.blob.core.windows.net/jitrollingbuild">
-    #   <Prefix>builds/</Prefix>
-    #   <Blobs>
-    #     <Blob>
-    #       <Name>builds/755f01659f03196487ec41225de8956911f8049b/Linux/x64/Checked/libclrjit.so</Name>
-    #       <Url>https://clrjit2.blob.core.windows.net/jitrollingbuild/builds/755f01659f03196487ec41225de8956911f8049b/Linux/x64/Checked/libclrjit.so</Url>
-    #       <Properties>
-    #         ...
-    #       </Properties>
-    #     </Blob>
-    #     <Blob>
-    #       <Name>builds/755f01659f03196487ec41225de8956911f8049b/OSX/x64/Checked/libclrjit.dylib</Name>
-    #       <Url>https://clrjit2.blob.core.windows.net/jitrollingbuild/builds/755f01659f03196487ec41225de8956911f8049b/OSX/x64/Checked/libclrjit.dylib</Url>
-    #       <Properties>
-    #         ...
-    #       </Properties>
-    #     </Blob>
-    #     ... etc. ...
-    #   </Blobs>
-    # </EnumerationResults>
     #
-    # We just want to extract the <Url> entries. We could probably use an XML parsing package, but we just
-    # use regular expressions.
+    # Note that there is a maximum number of results returned in one query of 5000. So we might need to
+    # iterate. In that case, the XML result contains a `<NextMarker>` element like:
+    #
+    # <NextMarker>2!184!MDAwMDkyIWJ1aWxkcy8wMTZlYzI5OTAzMzkwMmY2ZTY4Yzg0YWMwYTNlYzkxN2Y5MzA0OTQ2L0xpbnV4L3g2NC9DaGVja2VkL2xpYmNscmppdF93aW5fYXJtNjRfeDY0LnNvITAwMDAyOCE5OTk5LTEyLTMxVDIzOjU5OjU5Ljk5OTk5OTlaIQ--</NextMarker>
+    #
+    # which we need to pass to the REST API with `marker=...`.
 
-    urls_split = contents.split("<Url>")[1:]
     urls = []
-    for item in urls_split:
-        url = item.split("</Url>")[0].strip()
-        if filter_func(url):
-            urls.append(url)
+
+    list_az_container_uri_root = az_blob_storage_container_uri + "?restype=container&comp=list&prefix=" + az_builds_root_folder + "/"
+    if prefix_string:
+        list_az_container_uri_root += prefix_string
+
+    iter = 1
+    marker = ""
+
+    while True:
+        list_az_container_uri = list_az_container_uri_root + marker
+
+        try:
+            contents = urllib.request.urlopen(list_az_container_uri).read().decode('utf-8')
+        except Exception as exception:
+            print("Didn't find any collections using {}".format(list_az_container_uri))
+            print("  Error: {}".format(exception))
+            return None
+
+        # Contents is an XML file with contents like:
+        # <EnumerationResults ContainerName="https://clrjit2.blob.core.windows.net/jitrollingbuild">
+        #   <Prefix>builds/</Prefix>
+        #   <Blobs>
+        #     <Blob>
+        #       <Name>builds/755f01659f03196487ec41225de8956911f8049b/Linux/x64/Checked/libclrjit.so</Name>
+        #       <Url>https://clrjit2.blob.core.windows.net/jitrollingbuild/builds/755f01659f03196487ec41225de8956911f8049b/Linux/x64/Checked/libclrjit.so</Url>
+        #       <Properties>
+        #         ...
+        #       </Properties>
+        #     </Blob>
+        #     <Blob>
+        #       <Name>builds/755f01659f03196487ec41225de8956911f8049b/OSX/x64/Checked/libclrjit.dylib</Name>
+        #       <Url>https://clrjit2.blob.core.windows.net/jitrollingbuild/builds/755f01659f03196487ec41225de8956911f8049b/OSX/x64/Checked/libclrjit.dylib</Url>
+        #       <Properties>
+        #         ...
+        #       </Properties>
+        #     </Blob>
+        #     ... etc. ...
+        #   </Blobs>
+        # </EnumerationResults>
+        #
+        # We just want to extract the <Url> entries. We could probably use an XML parsing package, but we just
+        # use regular expressions.
+
+        urls_split = contents.split("<Url>")[1:]
+        for item in urls_split:
+            url = item.split("</Url>")[0].strip()
+            if filter_func(url):
+                urls.append(url)
+
+        # Look for a continuation marker.
+        re_match = re.match(r'.*<NextMarker>(.*)</NextMarker>.*', contents)
+        if re_match:
+            marker_text = re_match.group(1)
+            marker = "&marker=" + marker_text
+            iter += 1
+        else:
+            break
 
     return urls
 
@@ -449,7 +476,7 @@ def get_jit_urls(coreclr_args, find_all=False):
         url = url.lower()
         return find_all or url.startswith(blob_prefix_filter)
 
-    return list_az_jits(filter_jits)
+    return list_az_jits(filter_jits, None if find_all else blob_filter_string)
 
 
 def download_command(coreclr_args):
index 37f069d..449c8d2 100755 (executable)
@@ -2407,45 +2407,69 @@ def list_superpmi_collections_container_via_rest_api(path_filter=lambda unused:
 
     # This URI will return *all* the blobs, for all jit-ee-version/OS/architecture combinations.
     # pass "prefix=foo/bar/..." to only show a subset. Or, we can filter later using string search.
-    list_superpmi_container_uri = az_blob_storage_superpmi_container_uri + "?restype=container&comp=list&prefix=" + az_collections_root_folder + "/"
-
-    try:
-        contents = urllib.request.urlopen(list_superpmi_container_uri).read().decode('utf-8')
-    except Exception as exception:
-        logging.error("Didn't find any collections using %s", list_superpmi_container_uri)
-        logging.error("  Error: %s", exception)
-        return None
-
-    # Contents is an XML file with contents like:
     #
-    # <EnumerationResults ContainerName="https://clrjit.blob.core.windows.net/superpmi/collections">
-    #   <Blobs>
-    #     <Blob>
-    #       <Name>jit-ee-guid/Linux/x64/Linux.x64.Checked.frameworks.mch.zip</Name>
-    #       <Url>https://clrjit.blob.core.windows.net/superpmi/collections/jit-ee-guid/Linux/x64/Linux.x64.Checked.frameworks.mch.zip</Url>
-    #       <Properties>
-    #         ...
-    #       </Properties>
-    #     </Blob>
-    #     <Blob>
-    #       <Name>jit-ee-guid/Linux/x64/Linux.x64.Checked.mch.zip</Name>
-    #       <Url>https://clrjit.blob.core.windows.net/superpmi/collections/jit-ee-guid/Linux/x64/Linux.x64.Checked.mch.zip</Url>
-    #     ... etc. ...
-    #   </Blobs>
-    # </EnumerationResults>
+    # Note that there is a maximum number of results returned in one query of 5000. So we might need to
+    # iterate. In that case, the XML result contains a `<NextMarker>` element like:
     #
-    # We just want to extract the <Url> entries. We could probably use an XML parsing package, but we just
-    # use regular expressions.
-
-    url_prefix = az_blob_storage_superpmi_container_uri + "/" + az_collections_root_folder + "/"
+    # <NextMarker>2!184!MDAwMDkyIWJ1aWxkcy8wMTZlYzI5OTAzMzkwMmY2ZTY4Yzg0YWMwYTNlYzkxN2Y5MzA0OTQ2L0xpbnV4L3g2NC9DaGVja2VkL2xpYmNscmppdF93aW5fYXJtNjRfeDY0LnNvITAwMDAyOCE5OTk5LTEyLTMxVDIzOjU5OjU5Ljk5OTk5OTlaIQ--</NextMarker>
+    #
+    # which we need to pass to the REST API with `marker=...`.
 
-    urls_split = contents.split("<Url>")[1:]
     paths = []
-    for item in urls_split:
-        url = item.split("</Url>")[0].strip()
-        path = remove_prefix(url, url_prefix)
-        if path_filter(path):
-            paths.append(path)
+
+    list_superpmi_container_uri_base = az_blob_storage_superpmi_container_uri + "?restype=container&comp=list&prefix=" + az_collections_root_folder + "/"
+
+    iter = 1
+    marker = ""
+
+    while True:
+        list_superpmi_container_uri = list_superpmi_container_uri_base + marker
+
+        try:
+            contents = urllib.request.urlopen(list_superpmi_container_uri).read().decode('utf-8')
+        except Exception as exception:
+            logging.error("Didn't find any collections using %s", list_superpmi_container_uri)
+            logging.error("  Error: %s", exception)
+            return None
+
+        # Contents is an XML file with contents like:
+        #
+        # <EnumerationResults ContainerName="https://clrjit.blob.core.windows.net/superpmi/collections">
+        #   <Blobs>
+        #     <Blob>
+        #       <Name>jit-ee-guid/Linux/x64/Linux.x64.Checked.frameworks.mch.zip</Name>
+        #       <Url>https://clrjit.blob.core.windows.net/superpmi/collections/jit-ee-guid/Linux/x64/Linux.x64.Checked.frameworks.mch.zip</Url>
+        #       <Properties>
+        #         ...
+        #       </Properties>
+        #     </Blob>
+        #     <Blob>
+        #       <Name>jit-ee-guid/Linux/x64/Linux.x64.Checked.mch.zip</Name>
+        #       <Url>https://clrjit.blob.core.windows.net/superpmi/collections/jit-ee-guid/Linux/x64/Linux.x64.Checked.mch.zip</Url>
+        #     ... etc. ...
+        #   </Blobs>
+        # </EnumerationResults>
+        #
+        # We just want to extract the <Url> entries. We could probably use an XML parsing package, but we just
+        # use regular expressions.
+
+        url_prefix = az_blob_storage_superpmi_container_uri + "/" + az_collections_root_folder + "/"
+
+        urls_split = contents.split("<Url>")[1:]
+        for item in urls_split:
+            url = item.split("</Url>")[0].strip()
+            path = remove_prefix(url, url_prefix)
+            if path_filter(path):
+                paths.append(path)
+
+        # Look for a continuation marker.
+        re_match = re.match(r'.*<NextMarker>(.*)</NextMarker>.*', contents)
+        if re_match:
+            marker_text = re_match.group(1)
+            marker = "&marker=" + marker_text
+            iter += 1
+        else:
+            break
 
     return paths