main.py
import csv
import json
import logging
import os

import requests
from bs4 import BeautifulSoup
from huggingface_hub import hf_api

# Logger configuration for neat console logging
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.DEBUG)
formatter = logging.Formatter(
    "%(asctime)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
)
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)

BASE_URL = "https://huggingface.co"
# The third <ul> on a HuggingFace tree page holds the project's file listing
LIST_INDEX = 2
# This link title appears only on file entries on HuggingFace
FILE_IDENTIFIER = "Download file"
# Number of decimal digits kept when reporting a space's size
DIGIT_PRECISION = 1


def is_dir(title: str) -> bool:
    """
    Determines whether a list item is a directory or a file; HuggingFace titles
    file entries with their size.
    :param title: the title text of the list item
    :return: True for a directory, False for a file
    """
    if "kB" in title or "Bytes" in title or "MB" in title or "GB" in title:
        return False
    return True
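
# Illustrative behavior (hypothetical title strings):
#   is_dir("1.2 MB")  -> False  (a size in the title marks a file)
#   is_dir("configs") -> True   (anything else is treated as a directory)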


def update_csv(csv_file, row):
    """
    Appends a row to the end of the specified CSV file, writing the header
    first if the file is empty.
    :param csv_file: path of the CSV file to update
    :param row: the record to append
    """
    with open(csv_file, mode="a", newline="") as outfile:
        writer = csv.writer(outfile)
        if outfile.tell() == 0:
            writer.writerow(["category", "model", "space", "size"])
        writer.writerow(row)
        outfile.flush()
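
# A minimal usage sketch (hypothetical file name and values):
#   update_csv("text-classification.csv",
#              ["text-classification", "some-org/some-model", "user/demo-space", 12.3])
# The header row is written automatically the first time the file is created.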


def convert_size(size_text: str) -> int:
    """
    Converts a human-readable file size into bytes.
    :param size_text: the raw size of the file, e.g. "1.2 MB"
    :return: the size in bytes as an integer
    """
    if "kB" in size_text:
        return int(float(size_text.split(" kB")[0]) * 1024)
    if "MB" in size_text:
        return int(float(size_text.split(" MB")[0]) * 1024 * 1024)
    if "GB" in size_text:
        return int(float(size_text.split(" GB")[0]) * 1024 * 1024 * 1024)
    if "Bytes" in size_text:
        return int(size_text.split(" Bytes")[0])
    return 0
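
# Illustrative conversions (hypothetical size strings in the formats handled above):
#   convert_size("1.5 kB")    -> 1536
#   convert_size("2 MB")      -> 2097152
#   convert_size("120 Bytes") -> 120
#   convert_size("unknown")   -> 0   (unrecognized formats fall through to zero)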


def soup_crawl(url: str) -> int:
    """
    Opens the specified URL, parses the page for the file and directory list
    items, and sums the file sizes, recursing into each subdirectory.
    :param url: the page path, either the project root or an inner directory
    :return: the total size of the directory in bytes
    """
    size = 0
    try:
        content = requests.get(f"{BASE_URL}{url}")
        soup = BeautifulSoup(content.text, "html.parser")
        i = 0
        for tag in soup.findAll("ul"):
            # Skip the page's leading <ul> elements; the file listing starts at LIST_INDEX
            if i < LIST_INDEX:
                i += 1
                continue
            for t in tag.findAll("li"):
                a_tag = t.find("a")
                if a_tag:
                    href_value = a_tag.get("href")
                    a_tag = t.find("a", title=FILE_IDENTIFIER)
                    if a_tag:
                        # A "Download file" link marks a file; its text is the size
                        size_text = a_tag.get_text(strip=True)
                        size += convert_size(size_text)
                    else:
                        # No size link means a directory: descend into it
                        size += soup_crawl(href_value)
    except Exception as e:
        logger.error(e)
    return size
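
# A minimal usage sketch (hypothetical space path):
#   total_bytes = soup_crawl("/spaces/user/demo-space/tree/main")
# returns the summed size, in bytes, of every file reachable from that tree page.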


def crawl_spaces(category: str, number=20):
    """
    Fetches the spaces of every model in the models file and calls the crawler
    to obtain the size of each space, storing the results in the category's CSV file.
    :param category: the pipeline tag
    :param number: identifies the source models file
    """
    input_file_path = f"models-{category}-{number}.csv"
    result_file_path = f"{category}.csv"
    current_spaces = set()
    # Collect spaces already recorded so an interrupted run can be resumed
    if os.path.exists(result_file_path):
        with open(result_file_path, mode="r") as infile:
            reader = csv.reader(infile)
            next(reader)  # skip the header row
            for row in reader:
                space = row[2]
                current_spaces.add(space)
    with open(input_file_path, mode="r") as infile:
        reader = csv.reader(infile)
        next(reader)  # skip the header row
        for row in reader:
            model_id = row[1]
            logger.info(f"MODEL:{model_id}")
            spaces = list(hf_api.list_spaces(models=model_id))
            for space in spaces:
                if space.id in current_spaces:
                    continue
                current_spaces.add(space.id)
                logger.info(space.id)
                size = soup_crawl(f"/spaces/{space.id}/tree/main")
                logger.info(f"Space: {space.id}, size: {size}")
                update_csv(
                    csv_file=result_file_path,
                    row=[
                        category,
                        model_id,
                        space.id,
                        round(float(size) / 1024.0, ndigits=DIGIT_PRECISION),
                    ],
                )
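
# Note: spaces already present in the result file are skipped, so an interrupted
# run of, e.g., crawl_spaces(category="text-classification") can simply be
# restarted and will resume where it left off.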


def crawl_models(category: str, sort="downloads", number=20):
    """
    Fetches the top models of the specified category together with the number of
    applications (spaces) built on each, then stores the results in the models file.
    :param category: the pipeline tag
    :param sort: the sort key, e.g. "downloads", "likes", or any other sort field
        HuggingFace accepts
    :param number: the number of results
    """
    models_file = f"models-{category}-{number}.csv"
    model_dict = {}
    top_models = requests.get(
        f"{BASE_URL}/api/models",
        params={
            "pipeline_tag": category,
            "sort": sort,
            "direction": -1,
            "limit": number,
        },
    )
    prettified_models = json.loads(top_models.text)
    for model in prettified_models:
        spaces = list(hf_api.list_spaces(models=model["id"]))
        model_dict[model["id"]] = len(spaces)
    cnt = 0
    # Mode "w" truncates the file, so the header is always written first
    with open(models_file, mode="w", newline="") as outfile:
        writer = csv.writer(outfile)
        writer.writerow(["category", "model", "number_of_apps"])
        for k, v in model_dict.items():
            writer.writerow([category, k, v])
            cnt += v
        outfile.flush()
    logger.info(f"Category: {category}, Sum: {cnt}")


if __name__ == "__main__":
    logger.info("Starting the app...")
    crawl_models(category="text-classification")
    # crawl_models(category="text-generation")
    # crawl_spaces(category="text-classification")
    # crawl_spaces(category="text-generation")