teams-scraper.py
import requests
from bs4 import BeautifulSoup
import csv
"""
Scrape the wiki page to write csv files with teams names for all leagues we are interested in
Final step would be to combine all the csv files into the teams.csv
TODO: Loop through all the tables and filter the tables we are interested in
"""
if __name__ == '__main__':
    SCRAPE_URL = 'https://en.wikipedia.org/wiki/List_of_professional_sports_teams_in_the_United_States_and_Canada'
    # Get the HTML content
    html_content = requests.get(SCRAPE_URL).text
    soup = BeautifulSoup(html_content, 'html.parser')
    # Get the first wikitable on the page (assumed to be the MLB table)
    teams_table = soup.find("table", attrs={"class": "wikitable"})
    teams_table_rows = teams_table.tbody.find_all("tr")
    # Get all the headers
    headers = []
    for th in teams_table_rows[0].find_all("th"):
        headers.append(th.text.replace('\n', '').strip())
    print(headers)
    # Get the team name (first column) from every data row
    teams = []
    for tr in teams_table_rows:
        table_row = []
        for td in tr.find_all("td"):
            table_row.append(td.text.replace('\n', '').strip())
        if len(table_row) > 0:
            teams.append(table_row[0])
    print(teams)
    # Write to CSV file
    with open("data/mlb-teams.csv", 'w', newline='') as out_file:
        out_writer = csv.writer(out_file, delimiter=',')
        for team in teams:
            # Split "City Nickname" into the city part and the nickname
            data = team.rsplit(' ', 1)
            out_writer.writerow([data[0].lower(), data[1].lower(), 'baseball', 'm', 'mlb'])
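
    # Sketch for the TODO in the docstring: loop through every wikitable on
    # the page and keep only the tables for leagues we care about. The LEAGUES
    # mapping and the rule of matching the nearest preceding section heading
    # are assumptions for illustration; the filter may need adjusting to the
    # actual page layout.
    LEAGUES = {'mlb': 'baseball', 'nba': 'basketball', 'nfl': 'football', 'nhl': 'hockey'}
    for table in soup.find_all("table", attrs={"class": "wikitable"}):
        heading = table.find_previous(["h2", "h3"])
        section = heading.text.strip().lower() if heading else ''
        for league, sport in LEAGUES.items():
            if league in section:
                # Placeholder: extract the team column as above and write data/<league>-teams.csv
                print(league, sport, section)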