# scrapers.py
"""Perform webscraping related to typosquatting on PyPI.
A module that contains any functions that can make internet
calls to gather data related to typosquatting.
"""

import json
import sys
import urllib.error
import urllib.request

from bs4 import BeautifulSoup
import jsontree
import requests

import constants

TOP_N = constants.TOP_N


def get_all_packages(page="https://pypi.org/simple/"):
    """Download simple list of PyPI package names.

    pypi.org/simple conveniently lists the names of all current
    packages. This function scrapes that listing and then places
    the package names in a Python list.

    Args:
        page (str): webpage from which to download PyPI package names

    Returns:
        list: package names on PyPI
    """
    # Retrieve package name listing data from PyPI
    try:
        pypi_package_page = requests.get(page)
    except requests.exceptions.ConnectionError as e:
        print("Internet connection issue. Check connection.")
        print(e)
        sys.exit(1)

    # Convert HTML to an easily digestible format
    soup = BeautifulSoup(pypi_package_page.text, "html.parser")

    # Store package names in a list
    package_names = []
    for elem in soup.find_all("a"):  # Find all <a> tags
        package_names.append(elem.string)  # Get string inside <a> tag

    # Return package name list
    return package_names
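

# A minimal usage sketch for get_all_packages, commented out because it makes
# a live network call to pypi.org; the count printed is whatever the simple
# index returns at call time:
#
#   names = get_all_packages()
#   print(len(names))  # number of packages currently listed on PyPI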


def get_top_packages(top_n=TOP_N, stored=False):
    """Identify top packages by download count on PyPI.

    A friendly person has already provided an occasionally
    updated JSON feed that enables this program to build a list
    of the top PyPI packages by download count. By default this
    function does a fresh pull of that feed. If the user prefers
    a stored list, setting the stored flag to True uses a local
    copy instead.

    Args:
        top_n (int): the number of top packages to retrieve
        stored (bool): whether to use the stored package list

    Returns:
        dict: top packages, mapping package name to rank
    """
    if stored:  # Get stored data
        with open("top_packages_may_2020.json", "r") as f:
            data = json.load(f)
    else:  # Get JSON data for top PyPI packages from website
        top_packages_url = (
            "https://hugovk.github.io/top-pypi-packages/top-pypi-packages-30-days.json"
        )
        # Catch if internet connectivity causes failure
        try:
            with urllib.request.urlopen(top_packages_url) as url:  # nosec
                data = json.loads(url.read().decode())
        except urllib.error.URLError as e:
            print("Internet connection issue. Check connection.")
            print(e)
            sys.exit(1)

    # Make JSON data easy to navigate
    json_data = jsontree.jsontree(data)

    # Place top_n packages in a dict, where the key is the package
    # name and the value is the rank
    top_packages = {}
    for i in range(top_n):
        package_info = json_data.rows[i]
        package_name = package_info["project"]
        top_packages[package_name] = i + 1

    return top_packages
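

# A minimal usage sketch for get_top_packages, commented out because the
# default path downloads the JSON feed; the output shape shown is
# illustrative:
#
#   top = get_top_packages(top_n=3)
#   print(top)  # maps package name to rank, e.g. {"<name>": 1, ...}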


def get_metadata(name):
    """Retrieve PyPI package metadata for one package.

    Make an internet call to PyPI's JSON API for a particular
    package and return the metadata it provides.

    Args:
        name (str): name of package on PyPI for which to retrieve metadata

    Returns:
        dict: package metadata
    """
    try:
        # Make call to specified PyPI package via API endpoint
        link = "https://pypi.org/pypi/" + name + "/json"
        response = requests.get(link)
        # Convert JSON response to dict
        metadata_dict = response.json()
    except json.decoder.JSONDecodeError:
        # Fall back to an empty metadata skeleton if the response
        # body is not valid JSON
        metadata_dict = {
            "info": {
                "author_email": "",
                "author": "",
                "package_url": "",
                "description": "",
                "home_page": "",
                "summary": "",
            }
        }

    # Return dict version
    return metadata_dict
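

# A small end-to-end sketch, guarded so it only runs when the module is
# executed directly. It assumes network access; the package name "requests"
# is just an illustrative query, not part of the original module:
if __name__ == "__main__":
    top_packages = get_top_packages(top_n=5)
    print("Top 5 PyPI packages by downloads:", list(top_packages))
    metadata = get_metadata("requests")
    print("Summary for requests:", metadata["info"].get("summary", ""))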