sloan_scrape.py — executable file, 91 lines (79 loc), 2.61 KB
#!/usr/bin/python
# NOTE: This doesn't match the data in data/sloan.js anymore. It will dump a json file for you, but
# the data used in the ipsum generator has been tidied further for capitalization, apostrophes,
# removal of some brackets, chaining of lines to remove orphans, and so on.
import json
import urllib2
import requests
from bs4 import BeautifulSoup
# manual list to ignore special releases, live albums etc (no repeat lyrics)
# Each entry is the URL slug of a Sloan studio album on sloanmusic.com/release/.
albums = [
    'peppermint',
    'smeared',
    'twice-removed',
    'one-chord-to-another',
    'navy-blues',
    'between-the-bridges',
    'pretty-together',
    'action-pact',
    'a-sides-win-singles-1992-2005',
    'never-hear-the-end-of-it',
    'parallel-play',
    'hit-run',
    'the-double-cross'
    # 'commonwealth'
]
# Accumulators filled by the scraping loops below.
data = []       # cleaned lyric lines, dumped to data/sloan.json at the end
url = 'http://www.sloanmusic.com/release/'  # base URL for album track listings
song_urls = []  # per-song URL slugs collected from each album page
def strip_for_urls(s):
    """Convert a song title into the slug form used in sloanmusic.com URLs.

    Replaces spaces with dashes and strips smart apostrophes (U+2019),
    primes (U+2032), question marks, parentheses, and commas.
    """
    # The chain must be parenthesized: without continuation the original
    # returned `s` unchanged and left dangling `.replace` syntax errors.
    return (s
            .replace(' ', '-')
            .replace(u'\u2019', "")
            .replace(u'\u2032', "")
            .replace('?', '')
            .replace('(', '')
            .replace(')', '')
            .replace(',', ''))
def strip_for_text(s):
    """Normalize a scraped lyric line to plain ASCII-ish text.

    Strips leading whitespace and maps typographic characters (smart
    quotes, primes, en/em dashes, ellipsis) to plain equivalents.
    U+62AE -> "'r" works around a mis-encoded character on the site.
    """
    # The chain must be parenthesized: without continuation the original
    # returned `s` unchanged and left dangling `.replace` syntax errors.
    return (s
            .lstrip()
            .replace(u'\u2019', "'")
            .replace(u'\u201c', '"')
            .replace(u'\u201d', '"')
            .replace(u'\u2032', "'")
            .replace(u'\u2018', "'")
            .replace(u'\u2013', '-')
            .replace(u'\u2014', '-')
            .replace(u'\u2026', '')
            .replace(u'\u62ae', "'r"))
# first get a list of all the songs in each album
for a in albums:
    print 'Reading',a,'...'
    # polite User-Agent; the release page lists tracks as <a> links inside an <ol>
    req = urllib2.Request(url+a, headers={'User-Agent' : 'friendly scraper, thank you!'})
    soup = BeautifulSoup(urllib2.urlopen(req).read(), 'html.parser')
    for song_list in soup.find('ol').find_all('a'):
        songs = song_list.strings
        for song in songs:
            # NOTE(review): song is already a string from .strings; the
            # "".join appears redundant — confirm before removing
            song = "".join(song).lower()
            song = strip_for_urls(song)
            # From these two albums keep only the listed tracks (the rest
            # repeat on other releases); everything else is appended as-is.
            if a == 'peppermint' and (song != 'torn' and song != 'lucky-for-me' and song != 'pretty-voice'):
                pass
            elif a == 'a-sides-win-singles-1992-2005' and (song != 'try-to-make-it' and song != 'all-used-up'):
                pass
            elif song == 'take-the-bench':
                song_urls.append('the-the-bench') # sooooomebody made a typo on the sloan website
            else:
                song_urls.append(song)
# then loop through all the songs and grab the lyrics!
for song in song_urls:
    print 'Scraping',song,'...'
    req = urllib2.Request('http://www.sloanmusic.com/song/'+song, headers={ 'User-Agent': 'friendly scraper, thank you!'})
    soup = BeautifulSoup(urllib2.urlopen(req).read(), 'html.parser')
    # lyrics live in <p> tags inside the page's entry-content div
    for paragraph in soup.find('div', attrs={'class': 'entry-content'}).find_all('p'):
        paragraph = paragraph.contents
        for p in paragraph:
            # NOTE(review): for text nodes str.find returns -1 when '<br' is
            # absent (i.e. always, for plain text); for Tag nodes BS4's .find
            # returns None (never -1), so tags like <br> are skipped — confirm
            # this filtering is the intent
            if p.find('<br') == -1:
                p = strip_for_text(p).capitalize() # TODO: replace with non-destructive capitalization
                data.append(p)
# Write the collected lyric lines out as JSON. A with-block guarantees the
# file is flushed and closed (the original opened the handle inline and
# never closed it, relying on interpreter shutdown to flush).
with open("data/sloan.json", 'w') as out:
    json.dump(data, out, indent=1)