mirror of
https://github.com/opennetworkinglab/onos.git
synced 2025-10-16 01:41:26 +02:00
146 lines
6.0 KiB
Python
Executable File
146 lines
6.0 KiB
Python
Executable File
#!/usr/bin/env python
|
|
"""
This is a scraper-based set of functions designed to retrieve the complete set
of dependencies of a given artifact. The script is brittle: it will USUALLY
fetch the set of dependencies, but it may fail in certain cases, including on
pages with unusual url/table structures.

In its current configuration it lists the dependencies that were officially
supported at release; it does not note any updates whatsoever.
"""
|
|
import urllib.request
|
|
import urllib.response
|
|
|
|
|
|
# Working state shared by the crawl: artifact paths still to visit, artifact
# paths already visited, and a third set that no function in this file uses.
fringe, already_checked, human_readable_deps = set(), set(), set()

# Every artifact page url starts with this prefix; the remainder of the url is
# the "organization/artifact/version" path.
url_prefix = "https://mvnrepository.com/artifact/"

# Heading that introduces the compile dependency table; the dependency count
# follows the opening parenthesis.
compile_deps_string = "<h2>Compile Dependencies ("

# Markers delimiting the table body, its rows and the cells of a row.
table_open_string = "<tbody"
table_close_string = "</tbody>"

table_row_open_string = "<tr>"
table_row_close_string = "</tr>"

table_row_cell_open_string = "<td"
table_row_cell_close_string = "</td>"

# Start of the anchor whose href carries the dependency's artifact path.
unique_identified_prefix = '<a class="vbtn release" href="/artifact/'
|
|
|
|
|
|
# url MUST be of the form https://mvnrepository.com/artifact/*organization_id*/*artifact_id*/*version_id*
# This method takes a starting point url and compiles and returns the set of all the dependencies of that artifact
|
|
def get_deps_for_artifact(url):
    """Crawl the dependency graph starting from the artifact page at *url*.

    The url is reduced to its path below ``url_prefix`` and queued in the
    module-level ``fringe`` set. Paths are then popped one at a time, recorded
    in the module-level ``already_checked`` set, and expanded through
    ``get_deps_from_page`` until the fringe is empty.

    Returns the ``already_checked`` set itself, which therefore contains the
    starting artifact as well as everything reached from it. Both sets are
    module state and are not cleared here, so results of earlier calls remain
    in the returned set.

    Raises ValueError when *url* does not start with ``url_prefix``.
    """
    if url[:len(url_prefix)] != url_prefix:
        raise ValueError("The url must begin with a valid https address for the maven central repository.")

    fringe.add(url.replace(url_prefix, ""))

    while fringe:
        artifact_path = fringe.pop()
        if artifact_path in already_checked:
            continue
        already_checked.add(artifact_path)
        # Queue only dependencies that have not been visited yet; re-adding an
        # entry that is already queued is a no-op on a set.
        fringe.update(
            dep
            for dep in get_deps_from_page(url_prefix + artifact_path)
            if dep not in already_checked
        )

    return already_checked
|
|
|
|
|
|
# returns the set of compile dependencies listed on the specified page
# url is expected to be a valid artifact page url
|
|
def get_deps_from_page(url):
    """Return the set of compile dependencies listed on the page at *url*.

    The page is downloaded and searched for the ``compile_deps_string``
    heading. The number between that heading and ``")</h2>"`` is the
    dependency count; when it is non-zero, the next table body after the
    heading is handed to ``get_deps_from_table``.

    An empty set is returned when the page has no compile dependency heading,
    when the count is zero, or when no complete table body follows the
    heading.

    Raises ValueError when the heading is present but the text after it is
    not an integer. Errors raised by ``urllib.request.urlopen`` or by decoding
    the response propagate unchanged.
    """
    page_deps = set()

    # Use the response as a context manager so the connection is closed even
    # if reading or decoding fails.
    with urllib.request.urlopen(url) as response:
        html_string = response.read().decode()

    # Determine how many compile deps there are.
    heading_index = html_string.find(compile_deps_string)
    if heading_index == -1:
        # No compile dependency section on this page. Without this check the
        # -1 would be used as an offset and int() would be fed unrelated text.
        return page_deps
    start_index = heading_index + len(compile_deps_string)
    end_index = html_string.find(")</h2>", start_index)
    compile_deps_count = int(html_string[start_index:end_index])

    # Get the compile deps, if any.
    if compile_deps_count != 0:
        table_open_index = html_string.find(table_open_string, end_index)
        if table_open_index == -1:
            return page_deps
        close_tag_index = html_string.find(table_close_string, table_open_index)
        if close_tag_index == -1:
            return page_deps
        table_close_index = close_tag_index + len(table_close_string)
        page_deps.update(
            get_deps_from_table(html_string, table_open_index, table_close_index, compile_deps_count)
        )

    return page_deps
|
|
|
|
#Table is expected to be from the "<tbody>" tag to the "</tbody>" tag.
|
|
def get_deps_from_table(html_string, table_open_index, table_close_index, expected_count):
    """Return the set of dependencies found in one table body.

    The slice of *html_string* between *table_open_index* and
    *table_close_index* is scanned for rows delimited by
    ``table_row_open_string`` and ``table_row_close_string``. Each row is
    passed to ``get_dep_from_row`` and every non-None result is collected.

    *expected_count* is accepted for interface compatibility but is not used.
    """
    table_deps = set()
    row_start = html_string.find(table_row_open_string, table_open_index, table_close_index)

    while row_start != -1:
        row_close = html_string.find(table_row_close_string, row_start, table_close_index)
        if row_close == -1:
            # Unterminated row: stop here. Using the -1 as an offset would
            # restart the search near the top of the document and could loop
            # forever.
            break
        row_end = row_close + len(table_row_close_string)

        row_dep = get_dep_from_row(html_string, row_start, row_end)
        if row_dep is not None:
            table_deps.add(row_dep)

        row_start = html_string.find(table_row_open_string, row_end, table_close_index)

    return table_deps
|
|
|
|
#The row is expected to be from "<tr" tag to "</tr>" tag, it is expected to contain 5 pairs of matched
|
|
#"<td...></td>" tags, the fourth such pair will contain the desired information,
|
|
# the 5th will contain updated versions
|
|
def get_dep_from_row(html_string, row_start_index, row_end_index):
    """Return the dependency held in the fourth cell of a table row.

    The row spans *row_start_index* to *row_end_index* in *html_string*.
    Cells are located with ``table_row_cell_open_string`` and
    ``table_row_cell_close_string``; the fourth one is passed to
    ``get_dep_from_cell`` and its result is returned.

    Returns None when the row has fewer than four complete cells, or when
    ``get_dep_from_cell`` finds no dependency link in the fourth cell.
    """
    cell_start = -1
    cell_end = row_start_index

    # Walk forward to the fourth "<td ...>...</td>" pair.
    for _ in range(4):
        cell_start = html_string.find(table_row_cell_open_string, cell_end, row_end_index)
        if cell_start == -1:
            # Fewer cells than expected. Bail out rather than reuse -1 as an
            # offset, which would pick up cells from outside this row.
            return None
        cell_close = html_string.find(table_row_cell_close_string, cell_start, row_end_index)
        if cell_close == -1:
            return None
        cell_end = cell_close + len(table_row_cell_close_string)

    return get_dep_from_cell(html_string, cell_start, cell_end)
|
|
|
|
def get_dep_from_cell(html_string, cell_start_index, cell_end_index):
    """Extract the artifact path from the release link inside one table cell.

    Searches *html_string* between *cell_start_index* and *cell_end_index*
    for ``unique_identified_prefix`` and returns the text that follows it up
    to, but not including, the next double quote.

    Returns None when the cell has no such link, or when the link's closing
    quote is not found inside the cell.
    """
    prefix_index = html_string.find(unique_identified_prefix, cell_start_index, cell_end_index)
    if prefix_index == -1:
        return None

    start_index = prefix_index + len(unique_identified_prefix)
    end_index = html_string.find("\"", start_index, cell_end_index)
    if end_index == -1:
        # Unterminated href: slicing with -1 would return almost the whole
        # remainder of the document instead of a path.
        return None

    return html_string[start_index:end_index]
|
|
|
|
# Produces the dependency set but returns them in mvn coord style
|
|
def get_mvn_coordinates_deps(url):
    """Return the dependencies of the artifact at *url* as maven coordinates.

    Each "organization/artifact/version" path produced by
    ``get_deps_for_artifact`` is split at its first two slashes and rewritten
    as "mvn:organization:artifact:version". The result is a set of those
    strings.
    """

    def to_coordinates(path):
        # The first slash ends the organization id; the second ends the
        # artifact id, and everything after it is the version.
        first_slash = path.find("/")
        second_slash = path.find("/", first_slash + 1)
        org_id = path[:first_slash]
        artifact_id = path[first_slash + 1:second_slash]
        version = path[second_slash + 1:]
        return f"mvn:{org_id}:{artifact_id}:{version}"

    return {to_coordinates(path) for path in get_deps_for_artifact(url)}
|
|
|
|
|
|
# Prints out the complete set of deps for the specified package(s)
|
|
def print_collection(lst):
    """Print each element of *lst* on its own line, in iteration order."""
    for item in lst:
        print(item)
|
|
|
|
|
|
def main():
    """Ask for an artifact url and an output style, then print the dependencies.

    Answering "y" or "Y" to the second prompt prints maven coordinates; any
    other answer prints the url-style artifact paths.
    """
    url_prompt = (
        "Please enter the url of the repo whose dependencies you would like?"
        "\n(this should be a fully qualified url\nex: https://mvnrepository."
        "com/artifact/com.google.guava/guava/19.0)"
    )
    url = input(url_prompt)
    answer = input("Would you like maven coordinate output, enter 'y' for yes? (alternately url style paths will be provided)")

    collect = get_mvn_coordinates_deps if answer in ("y", "Y") else get_deps_for_artifact
    print_collection(collect(url))


if __name__ == "__main__":
    main()
|