#!/usr/bin/env python """ This is a scraper based series of scripts designed to retrieve the complete set of deps of a given artifact. This script is brittle, it will USUALLY fetch the set of dependencies however it may fail in certain cases, including on pages with unusual url table structures. In its current configuration it will list the dependencies that were officially supported at release, it will not note any updates whatsoever. """ import urllib.request import urllib.response fringe = set() already_checked = set() human_readable_deps = set() url_prefix = "https://mvnrepository.com/artifact/" compile_deps_string = "

Compile Dependencies (" table_open_string = "", start_index) compile_deps_count = int(html_string[start_index: end_index]) # Get the compile deps if any if compile_deps_count != 0: table_open_index = html_string.find(table_open_string, end_index) table_close_index = html_string.find(table_close_string, table_open_index) + len(table_close_string) compile_deps = get_deps_from_table(html_string, table_open_index, table_close_index, compile_deps_count) page_deps.update(compile_deps) return page_deps #Table is expected to be from the "" tag to the "" tag. def get_deps_from_table(html_string, table_open_index, table_close_index, expected_count): table_deps = set() start_index = html_string.find(table_row_open_string, table_open_index, table_close_index) while start_index != -1: end_index = html_string.find(table_row_close_string, start_index, table_close_index)\ + len(table_row_close_string) row_dep = get_dep_from_row(html_string, start_index, end_index) if row_dep != None: table_deps.add(row_dep) start_index = html_string.find(table_row_open_string, end_index, table_close_index) return table_deps #The row is expected to be from "" tag, it is expected to contain 5 pairs of matched #"" tags, the fourth such pair will contain the desired information, # the 5th will contain updated versions def get_dep_from_row(html_string, row_start_index, row_end_index): start_index = row_start_index end_index = row_end_index start_index = html_string.find(table_row_cell_open_string, start_index, row_end_index) end_index = html_string.find(table_row_cell_close_string, start_index, row_end_index)\ + len(table_row_cell_close_string) #set the indecies for the fourth "" element for i in range(3): start_index = html_string.find(table_row_cell_open_string, end_index, row_end_index) end_index = html_string.find(table_row_cell_close_string, start_index, row_end_index)\ + len(table_row_cell_close_string) return get_dep_from_cell(html_string, start_index, end_index) def 
def get_dep_from_cell(html_string, cell_start_index, cell_end_index):
    """Return the dependency path embedded in one table cell.

    Looks for the module-level ``unique_identified_prefix`` inside
    ``html_string[cell_start_index:cell_end_index]`` and returns the text
    between the end of that prefix and the next double quote.

    Args:
        html_string: Full text of the page being scraped.
        cell_start_index: Index at which the cell starts.
        cell_end_index: Index just past the end of the cell.

    Returns:
        The extracted text, or None when the cell does not contain the prefix
        or the closing double quote is missing.
    """
    prefix_index = html_string.find(unique_identified_prefix, cell_start_index, cell_end_index)
    if prefix_index == -1:
        return None
    start_index = prefix_index + len(unique_identified_prefix)
    end_index = html_string.find("\"", start_index, cell_end_index)
    if end_index == -1:
        # Without this guard the slice below would end at -1 and return
        # nearly the whole remainder of the page.
        return None
    return html_string[start_index:end_index]


def get_mvn_coordinates_deps(url):
    """Return the dependencies of an artifact in Maven-coordinate style.

    Each element produced by ``get_deps_for_artifact(url)`` is a
    slash-separated path; it is rewritten as ``mvn:`` followed by its first
    three components joined with ``:``. Anything after the second slash is
    kept verbatim as the last component. Paths with fewer components simply
    yield fewer fields.

    Args:
        url: Passed through unchanged to ``get_deps_for_artifact``.

    Returns:
        A set of coordinate strings.
    """
    return {
        "mvn:" + ":".join(elem.split("/", 2))
        for elem in get_deps_for_artifact(url)
    }


def print_collection(lst):
    """Print every element of ``lst`` on its own line."""
    for elem in lst:
        print(elem)


def main():
    """Prompt for an artifact URL and an output style, then print the deps."""
    url = input(
        "Please enter the url of the repo whose dependencies you would like?"
        "\n(this should be a fully qualified url\nex: https://mvnrepository."
        "com/artifact/com.google.guava/guava/19.0)"
    ).strip()
    mvn_coords = input(
        "Would you like maven coordinate output, enter 'y' for yes? "
        "(alternately url style paths will be provided)"
    )
    # Accept "y"/"Y" with any surrounding whitespace.
    if mvn_coords.strip().lower() == "y":
        print_collection(get_mvn_coordinates_deps(url))
    else:
        print_collection(get_deps_for_artifact(url))


if __name__ == '__main__':
    main()