1 #!/usr/bin/env python3.8
2
3 import argparse
4 import os
5 import json
6
7 from typing import Dict, Any
8 from urllib.request import urlretrieve
9
# Command-line interface: controls how many entries of the package list are fetched.
argparser = argparse.ArgumentParser(
    prog="download_pypi_packages",
    description="Helper program to download PyPI packages",
)

# -n/--number: fetch only the first N entries of the package list (default 100).
argparser.add_argument(
    "-n",
    "--number",
    type=int,
    default=100,
    help="Number of packages to download",
)

# -a/--all: fetch every listed entry; main() tests this flag before --number.
argparser.add_argument(
    "-a",
    "--all",
    action="store_true",
    help="Download all packages listed in the json file",
)
20
21
def load_json(filename: str) -> Dict[Any, Any]:
    """Parse ``data/<filename>.json`` and return the decoded document.

    Args:
        filename: Base name of the file, without the ``.json`` extension.
            The path is resolved relative to the current working directory.

    Returns:
        The decoded JSON value.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 rather than relying on the platform's locale
    # encoding, which may not be able to decode non-ASCII JSON text.
    with open(os.path.join("data", f"{filename}.json"), "r", encoding="utf-8") as f:
        return json.load(f)
26
27
def remove_json(filename: str) -> None:
    """Delete ``data/<filename>.json``, resolved against the working directory."""
    os.remove(os.path.join("data", f"{filename}.json"))
31
32
def download_package_json(package_name: str) -> None:
    """Fetch the PyPI JSON document for *package_name* into ``data/``.

    The document is requested from ``https://pypi.org/pypi/<package_name>/json``
    and written to ``data/<package_name>.json`` relative to the working
    directory.
    """
    destination = os.path.join("data", f"{package_name}.json")
    urlretrieve(f"https://pypi.org/pypi/{package_name}/json", destination)
36
37
def download_package_code(name: str, package_json: Dict[Any, Any]) -> None:
    """Download a package's source distribution into ``data/pypi``.

    Args:
        name: Project name; used only in the error message.
        package_json: Decoded package metadata. Its ``"urls"`` list is
            searched for the first entry whose ``"python_version"`` equals
            ``"source"``; that entry's ``"url"`` is saved under its
            ``"filename"``.

    Raises:
        IndexError: If no entry in ``"urls"`` is a source distribution.
        KeyError: If ``"urls"`` or one of the per-entry keys is missing.
    """
    source_info = next(
        (info for info in package_json["urls"] if info["python_version"] == "source"),
        None,
    )
    if source_info is None:
        # A -1 "not found" sentinel would index the *last* entry (e.g. a
        # wheel) whenever the list is non-empty, so signal the miss explicitly.
        raise IndexError(f"no source distribution listed for {name}")

    filename = source_info["filename"]
    url = source_info["url"]
    urlretrieve(url, os.path.join("data", "pypi", filename))
47
48
def main() -> None:
    """Download source archives for the packages listed in the local index.

    Reads ``data/top-pypi-packages-365-days.json`` and selects either every
    entry of its ``"rows"`` list (``--all``) or the first ``--number`` entries.
    For each selected package the JSON metadata is downloaded, the source
    archive is fetched into ``data/pypi``, and the temporary metadata file is
    removed again.

    Raises:
        AssertionError: If ``--all`` is not given and ``--number`` is outside
            the range 0..4000.
    """
    args = argparser.parse_args()
    number_packages = args.number
    all_packages = args.all

    top_pypi_packages = load_json("top-pypi-packages-365-days")
    if all_packages:
        top_pypi_packages = top_pypi_packages["rows"]
    elif 0 <= number_packages <= 4000:
        top_pypi_packages = top_pypi_packages["rows"][:number_packages]
    else:
        raise AssertionError("Unknown value for NUMBER_OF_PACKAGES")

    # Create the output directory; an already existing directory is fine.
    os.makedirs(os.path.join("data", "pypi"), exist_ok=True)

    for package in top_pypi_packages:
        package_name = package["project"]

        # The progress lines end without a newline, so flush explicitly;
        # otherwise they only become visible once the download has finished.
        print(f"Downloading JSON Data for {package_name}... ", end="", flush=True)
        download_package_json(package_name)
        print("Done")

        package_json = load_json(package_name)
        try:
            print(
                f"Downloading and compressing package {package_name} ... ",
                end="",
                flush=True,
            )
            download_package_code(package_name, package_json)
            print("Done")
        except (IndexError, KeyError):
            # No usable source entry in the metadata: report and move on.
            print(f"Could not locate source for {package_name}")
            continue
        finally:
            # The per-package metadata file is only needed for this iteration.
            remove_json(package_name)
84
85
86 if __name__ == "__main__":
87 main()