(root)/
Python-3.12.0/
Tools/
peg_generator/
scripts/
download_pypi_packages.py
       1  #!/usr/bin/env python3.8
       2  
       3  import argparse
       4  import os
       5  import json
       6  
       7  from typing import Dict, Any
       8  from urllib.request import urlretrieve
       9  
      10  argparser = argparse.ArgumentParser(
      11      prog="download_pypi_packages",
      12      description="Helper program to download PyPI packages",
      13  )
      14  argparser.add_argument(
      15      "-n", "--number", type=int, default=100, help="Number of packages to download"
      16  )
      17  argparser.add_argument(
      18      "-a", "--all", action="store_true", help="Download all packages listed in the json file"
      19  )
      20  
      21  
      22  def load_json(filename: str) -> Dict[Any, Any]:
      23      with open(os.path.join("data", f"{filename}.json"), "r") as f:
      24          j = json.loads(f.read())
      25      return j
      26  
      27  
      28  def remove_json(filename: str) -> None:
      29      path = os.path.join("data", f"{filename}.json")
      30      os.remove(path)
      31  
      32  
      33  def download_package_json(package_name: str) -> None:
      34      url = f"https://pypi.org/pypi/{package_name}/json"
      35      urlretrieve(url, os.path.join("data", f"{package_name}.json"))
      36  
      37  
      38  def download_package_code(name: str, package_json: Dict[Any, Any]) -> None:
      39      source_index = -1
      40      for idx, url_info in enumerate(package_json["urls"]):
      41          if url_info["python_version"] == "source":
      42              source_index = idx
      43              break
      44      filename = package_json["urls"][source_index]["filename"]
      45      url = package_json["urls"][source_index]["url"]
      46      urlretrieve(url, os.path.join("data", "pypi", filename))
      47  
      48  
      49  def main() -> None:
      50      args = argparser.parse_args()
      51      number_packages = args.number
      52      all_packages = args.all
      53  
      54      top_pypi_packages = load_json("top-pypi-packages-365-days")
      55      if all_packages:
      56          top_pypi_packages = top_pypi_packages["rows"]
      57      elif number_packages >= 0 and number_packages <= 4000:
      58          top_pypi_packages = top_pypi_packages["rows"][:number_packages]
      59      else:
      60          raise AssertionError("Unknown value for NUMBER_OF_PACKAGES")
      61  
      62      try:
      63          os.mkdir(os.path.join("data", "pypi"))
      64      except FileExistsError:
      65          pass
      66  
      67      for package in top_pypi_packages:
      68          package_name = package["project"]
      69  
      70          print(f"Downloading JSON Data for {package_name}... ", end="")
      71          download_package_json(package_name)
      72          print("Done")
      73  
      74          package_json = load_json(package_name)
      75          try:
      76              print(f"Downloading and compressing package {package_name} ... ", end="")
      77              download_package_code(package_name, package_json)
      78              print("Done")
      79          except (IndexError, KeyError):
      80              print(f"Could not locate source for {package_name}")
      81              continue
      82          finally:
      83              remove_json(package_name)
      84  
      85  
# Run the downloader only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()