forked from chrishoffman/smugpy
-
Notifications
You must be signed in to change notification settings - Fork 0
/
keyword-fixer.py
executable file
·145 lines (128 loc) · 4.9 KB
/
keyword-fixer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
#!/usr/bin/env python
import logging
import webapp2
from mylib.smugpy import SmugMug
import re
import time
# SmugMug API credentials for the "keyword-fixer" app.
# NOTE(review): these values are committed in plain text. Consider loading
# them from configuration or the environment, and rotating them -- confirm
# with the owner of the SmugMug account.
# In the visible part of this file OAUTH_SECRET is referenced only by the
# commented-out OAuth handshake that follows.
API_KEY = "yTTjttx2pkK1QSTIPS4geSsTzXZqtdxp"
OAUTH_SECRET = "786c1e774f5d6c5dda7e92bdf7d4b30c"
# smugmug = SmugMug(api_key=API_KEY, oauth_secret=OAUTH_SECRET,
# app_name="keyword-fixer")
# #Oauth handshake
# smugmug.auth_getRequestToken()
# raw_input("Authorize app at %s\n\nPress Enter when complete.\n" % (smugmug.authorize()))
# smugmug.auth_getAccessToken()
# Basic flow:
# (1) Download keyword strings for images in albums matching condition
# Input: Condition to filter albums: regex, LastUpdated, ??
# Output: file mapping imageID to keyword string
# (2) Analyze keyword strings, suggest remapping
# Input: file mapping imageID to keyword string
# Output: file listing each keyword, its frequency, and suggested mapping
# e.g. "cardbaord" x 3 -> "cardboard"
# (3) Remap keywords
# Input: Output files from previous two steps, possibly after editing
# Output: New file mapping imageID to keyword string
# Q: Should file include both before and after keywords?
# Q: Should it include unchanged keywords?
# (4) Upload keyword strings.
# Input: File mapping imageID to keyword string
#
# Basic data structure:
# keyinfo["images"][imageID]["old"] : original keyword string
# ["new"] : updated keyword string
# ["key"] : Image key
# ["keywords"][keyword]["cnt"] : Frequency
#
# Better data structure:
# Struct Album:
# Id:
# Key:
# Images: List of Image structs
#
# Struct Image:
# Id:
# Key:
# OrigKeywordString:
# OrigKeywords: list()
# Keywords: list() of proposed new keywords for image
#
# Struct Keyword:
# string Word:
# int Count:
# list Variations: typographically close variations, e.g. 1 letter
# dropped, two letters swapped
#
# Generate map of each keyword with each letter removed
# Cross reference this list to identify letter swaps. (how??)
# See http://norvig.com/spell-correct.html
# read and report stuff
def get_kw(nickname="brettcoon", months_ago=3, max_albums=5, smugmug=None):
    """Build a plain-text report of the keywords used in recent albums.

    Albums for ``nickname`` updated within the look-back window are listed,
    every image in them is queried for its keyword string, and each distinct
    keyword is tallied.  The report lists the albums and images that were
    read, followed by one line per keyword with its frequency and, where a
    heuristic fires, a remark about why the keyword looks wrong.

    Args:
        nickname: SmugMug account nickname passed to ``albums_get``.
        months_ago: Size of the look-back window; a month is approximated
            as 30 days.
        max_albums: Maximum number of albums to read, or ``None`` for no
            limit.
        smugmug: Client object providing ``albums_get``, ``images_get`` and
            ``images_getInfo``.  When omitted, an anonymous ``SmugMug``
            client is created with the module's ``API_KEY``.

    Returns:
        The report as a single string.

    NOTE(review): the response layout (``albums["Albums"]``,
    ``image_list["Album"]["Images"]``, ``info["Image"]["Keywords"]``) is
    taken from how the responses are indexed here -- confirm against the
    SmugMug 1.3.0 API.
    """
    if smugmug is None:
        smugmug = SmugMug(api_key=API_KEY, api_version="1.3.0",
                          app_name="keyword-fixer")

    # Compiled once up front; raw strings keep the backslashes literal.
    re_keydiv = re.compile(r',\s*')
    re_numonly = re.compile(r'^\d+$')
    re_nummostly = re.compile(r'\d{5}')
    re_inside_space = re.compile(r'^\S.*\s.*\S')
    re_suspicious = re.compile(r'(^\d+$)|\W|\d{4}|(^$)')

    last_updated = int(time.time() - 3600 * 24 * 30 * months_ago)

    report = []        # report fragments, joined once at the end
    keywords = dict()  # keyword -> {"cnt": frequency, "images": [(id, key)]}

    albums = smugmug.albums_get(NickName=nickname, LastUpdated=last_updated)
    for acnt, album in enumerate(albums["Albums"], 1):
        # Check the cap before logging so no unread album is reported.
        if max_albums is not None and acnt > max_albums:
            break
        logging.info("Reading album #%d", acnt)

        image_list = smugmug.images_get(AlbumID=album["id"],
                                        AlbumKey=album["Key"],
                                        LastUpdated=last_updated)
        report.append("%s, %s has %d images\n"
                      % (album["id"], album["Title"],
                         image_list["Album"]["ImageCount"]))

        for image in image_list["Album"]["Images"]:
            im_id = image["id"]
            im_key = image["Key"]
            im_info = smugmug.images_getInfo(ImageID=im_id, ImageKey=im_key)
            kwords = im_info["Image"]["Keywords"]
            report.append(" image id=%s key=%s kwords='%s'\n"
                          % (im_id, im_key, kwords))

            # An empty keyword string yields the single keyword "", which is
            # deliberately counted and later flagged by re_suspicious.
            for kw in re_keydiv.split(kwords):
                entry = keywords.setdefault(kw, {"cnt": 0, "images": []})
                entry["cnt"] += 1
                # Not used by the report yet; kept for the planned remapping.
                entry["images"].append((im_id, im_key))

    for kw, info in keywords.items():
        line = "Keyword: %-30s Cnt: %3d " % (kw, info["cnt"])
        if re_inside_space.search(kw):
            line += " probably needs to be split better"
        elif re_numonly.search(kw):
            line += " is BAD"
        elif re_suspicious.search(kw):
            line += " is really SUSPICIOUS"
        elif re_nummostly.search(kw):
            # NOTE(review): a keyword with five consecutive digits also has
            # four, so re_suspicious always wins and this branch never
            # fires.  Order kept to preserve the report -- confirm intent.
            line += " is SUSPICIOUS"
        report.append(line + "\n")

    return "".join(report)
class MainPage(webapp2.RequestHandler):
    """Handler that serves the keyword report as plain text."""

    def get(self):
        """Write a greeting, the output of get_kw(), and a sign-off."""
        out = self.response
        out.headers['Content-Type'] = 'text/plain'
        # The greeting is written before get_kw() runs, as in the original.
        out.write('Hello, silly webapp2 person!')
        out.write("\nGoing to call my code now.\n")
        report = get_kw()
        out.write(report)
        out.write("\nAnd that's all I have to say about that.\n")
# WSGI application object; routes the root URL '/' to MainPage.
# NOTE(review): debug=True is presumably meant for development only --
# confirm before deploying.
app = webapp2.WSGIApplication([('/', MainPage)],
debug=True)