nps_photo_retrieve.py
#!/usr/bin/env python
import urllib2
nps_site = "http://www.nps.gov/features/yell/slidefile/"
imageDirectory = "data/nps/"
# Return the index just past the last '/' in a URL, used to split a URL
# into its site prefix and its page (or file) name.
def getPage(url):
    return url.rfind('/') + 1
# Return list of URLs linked from the page
def readPageForURLs(site, html):
    urls = []
    urlPos = html.find("<a ")
    while urlPos >= 0:
        startIndex = html.find('"', urlPos)
        endIndex = html.find('"', startIndex + 1)
        # Only include relative URLs. Also, exclude links into the Images/ directory
        if html[startIndex + 1:startIndex + 5] != 'http' and \
           html[startIndex + 1:startIndex + 7] != 'Images':
            urls.append(site + html[startIndex + 1:endIndex])
        urlPos = html.find("<a ", urlPos + 1)
    return urls
# Return list of image links
def readPageForIMGs(site, html):
    imgs = []
    imgPos = html.find("Images/")
    while imgPos >= 0:
        endIndex = html.find('"', imgPos)
        imgs.append(site + html[imgPos:endIndex])
        imgPos = html.find("Images/", imgPos + 1)
    return imgs
# Recursively search the given site for images.
def retrieveImages(site, page, foundSites, images):
    if site + page in foundSites:
        return
    foundSites.add(site + page)
    try:
        response = urllib2.urlopen(site + page)
        html = response.read()
        # Read the URLs and the images of the page
        newURLs = readPageForURLs(site, html)
        newIMGs = readPageForIMGs(site, html)
        images.extend(newIMGs)
        # Recursively search the new URLs
        for newURL in newURLs:
            pageIndex = getPage(newURL)
            newSite = newURL[:pageIndex]
            newPage = newURL[pageIndex:]
            retrieveImages(newSite, newPage, foundSites, images)
    except:
        print 'Could not read:', site + page
# Download the image at the given URL into imageDirectory.
def downloadImage(image):
    try:
        response = urllib2.urlopen(image)
        binary = response.read()
        imageIndex = getPage(image)
        outputFile = image[imageIndex:]
        # Write in binary mode so the image data is not mangled on Windows.
        f = open(imageDirectory + outputFile, 'wb')
        f.write(binary)
        f.close()
    except:
        print 'Could not read image:', image
images = []
retrieveImages(nps_site, "", set(), images)
print "Found", len(images), "images"
for image in images:
    downloadImage(image)
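
The script above targets Python 2 (urllib2 and print statements). As a minimal sketch only, assuming Python 3 with the standard-library urllib.request and os modules, the same download step could look like the following; the names download_image_py3 and image_directory are illustrative and not part of the original file:

#!/usr/bin/env python3
# Sketch: a Python 3 counterpart of downloadImage(), assuming urllib.request
# stands in for urllib2. Names here are illustrative, not from the original script.
import os
import urllib.request

image_directory = "data/nps/"

def download_image_py3(image_url):
    try:
        # Fetch the raw image bytes.
        with urllib.request.urlopen(image_url) as response:
            binary = response.read()
        # Use the portion of the URL after the last '/' as the file name.
        filename = image_url[image_url.rfind('/') + 1:]
        # Create the output directory if it does not already exist.
        os.makedirs(image_directory, exist_ok=True)
        with open(os.path.join(image_directory, filename), 'wb') as f:
            f.write(binary)
    except Exception as e:
        print('Could not read image:', image_url, e)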