# cache-extractor.py
#
# Walk a Squid cache_dir, parse the squid_meta, whitelisted HTTP headers, and
# response payload out of each cache file, then write one JSON record per
# file to cache-extractor.json.
#
import os
import time
import md5
import binascii
import zlib
import json
from urlparse import urlparse
from datetime import datetime
from bs4 import BeautifulSoup as bs
# set cache_dir from which to extract files
cache_dir = "./data/squid3"
# set whitelist for http headers
whitelist = [
    'Location', 'Content-Type', 'Content-Encoding', 'Content-Language',
    'Expires', 'Date', 'Last-Modified', 'Vary', 'Mime-Version', 'Server',
    'Content-Length', 'Keep-Alive', 'Cache-Control',
    'Access-Control-Allow-Origin', 'ETag', 'P3P', 'Set-Cookie', 'Via',
    'Aka-DNS-Name', 'Content-Security-Policy',
    'Content-Security-Policy-Report-Only', 'X-Cache', 'X-Amz-Cf-Id',
    'X-Served-By', 'X-Powered-By', 'X-CDN', 'X-Frame-Options',
    'X-Content-Type-Options', 'X-Varnish', 'Strict-Transport-Security'
]
# hex byte parse of squid_meta, headers, payload from a cache_file
def parse_cache_file(cache_file):
    # open the cache_file on disk and read
    with open(cache_file, 'rb') as cache_file_raw:
        # create dictionary to store all parsed fields
        cache_file_parsed = {}
        cache_file_parsed['references'] = []
        # store file_path and swap_filen
        cache_file_parsed['file_path'] = cache_file
        cache_file_parsed['swap_filen'] = cache_file.split("/").pop()
        # parse url, headers, payload from cache_file
        got_squid_meta = False
        payload = b""
        byte = cache_file_raw.read(1)
        str_buff = byte
        # read all file bytes
        while byte != "":
            # check for \r\n (next header)
            # hex bytes 0d0a = \r\n
            if byte.encode('hex') == "0d":
                byte2 = cache_file_raw.read(1)
                if byte2.encode('hex') == "0a":
                    # if got_squid_meta this is a header
                    if got_squid_meta:
                        try:
                            key = str_buff.split(":", 1)[0].strip()
                            value = str_buff.split(":", 1)[1].strip()
                            # only add header if in whitelist
                            if key in whitelist:
                                cache_file_parsed[key] = value
                        except IndexError:
                            print("Error: IndexError in parse_cache_file at cache_file: %s"
                                  % cache_file)
                    # parse_squid_meta from buffer
                    else:
                        cache_file_parsed.update(
                            parse_squid_meta(str_buff))
                        got_squid_meta = True
                    str_buff = ""
                    # check for double \r\n (payload)
                    word2 = cache_file_raw.read(2)
                    if word2.encode('hex') == "0d0a":
                        # payload, read to EOF
                        payload = cache_file_raw.read()
                    # word2 starts next header
                    else:
                        byte = ""
                        str_buff = word2
                # not a new header, add bytes to buffer
                else:
                    str_buff = str_buff + byte + byte2
            # not a new header, add byte to buffer
            else:
                str_buff = str_buff + byte
            # read another byte
            byte = cache_file_raw.read(1)
        # md5 response payload
        md5sum = md5.new()
        md5sum.update(payload)
        cache_file_parsed['payload_md5'] = md5sum.hexdigest()
        # add a ref for HTTP redirects (301,302)
        if "Location" in cache_file_parsed.keys():
            cache_file_parsed['references'].append(
                {'ref': cache_file_parsed['Location'],
                 'type': 'redirect'})
        # check for gzip file signatures
        # hex bytes 1f8b08 = gzip signature
        if payload.encode('hex')[:6] == "1f8b08":
            payload = decompress_gzip(payload)
        # check for zip file signature
        # hex bytes 504b0304 = zip signature (PKZIP archive_1)
        if payload.encode('hex')[:8] == "504b0304":
            payload = decompress_pk(payload)
        # if content is text/html parse the payload for references
        if "Content-Type" in cache_file_parsed.keys():
            if cache_file_parsed['Content-Type'] == "text/html":
                cache_file_parsed['references'] = (
                    cache_file_parsed['references']
                    + parse_references(payload, cache_file_parsed))
        # convert some header date formats to standard unix timestamp
        cache_file_parsed = convert_time_strings(cache_file_parsed)
        # md5 response payload after any decompression
        md5sum2 = md5.new()
        md5sum2.update(payload)
        cache_file_parsed['payload_md5_decompressed'] = md5sum2.hexdigest()
        # return cache_file_parsed data structure
        return cache_file_parsed
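# A minimal usage sketch (hypothetical swap file path), kept as a comment so it
# does not run ahead of the batch loop at the bottom of this script:
#
#   parsed = parse_cache_file("./data/squid3/00/00/00000000")
#   print(json.dumps(parsed, indent=2))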
# parse squid_meta in cache_file header
def parse_squid_meta(squid_meta):
    squid_meta_parsed = {}
    # parse squid_meta from cache_file
    cache_key = squid_meta[11:27]    # md5 cache_key (hex string)
    timestamp = squid_meta[32:39]    # timestamp (host endianness bytes)
    lastref = squid_meta[40:47]      # lastref (host endianness bytes)
    expires = squid_meta[48:55]      # expires (host endianness bytes)
    lastmod = squid_meta[56:63]      # lastmod (host endianness bytes)
    refcount = squid_meta[72:73]     # refcount (host endianness bytes)
    flags = squid_meta[74:76]        # flags (host endianness bytes)
    url_ver_code = binascii.b2a_hex(squid_meta[81:])  # remainder of string
    url = ""
    http_ver = 0
    http_code = 0
    try:
        url = binascii.a2b_hex(                           # url (string)
            url_ver_code[:url_ver_code.find("000a")])     # 000a terminated
        http_ver = binascii.a2b_hex(                      # http_ver (string)
            url_ver_code[url_ver_code.find("48545450"):]  # 48545450 = HTTP
            ).split()[0]
        http_code = int(binascii.a2b_hex(                 # http_code (string)
            url_ver_code[url_ver_code.find("48545450"):]  # 48545450 = HTTP
            ).split()[1])
        url_parse = urlparse(url)
        # try to get root domain from hostname
        root_domain = url_parse.hostname.split(".")
        root_domain = ".".join(len(root_domain[-2]) < 4 and root_domain[-3:] or root_domain[-2:])
        squid_meta_parsed['meta_cache_key'] = binascii.b2a_hex(cache_key)
        squid_meta_parsed['meta_timestamp'] = int(binascii.b2a_hex(timestamp[::-1]), 16)
        squid_meta_parsed['meta_lastref'] = int(binascii.b2a_hex(lastref[::-1]), 16)
        squid_meta_parsed['meta_expires'] = int(binascii.b2a_hex(expires[::-1]), 16)
        squid_meta_parsed['meta_lastmod'] = int(binascii.b2a_hex(lastmod[::-1]), 16)
        squid_meta_parsed['meta_refcount'] = int(binascii.b2a_hex(refcount[::-1]), 16)
        squid_meta_parsed['meta_flags'] = int(binascii.b2a_hex(flags[::-1]), 16)
        squid_meta_parsed['url'] = url
        squid_meta_parsed['url_host'] = url_parse.netloc
        squid_meta_parsed['url_tld'] = url_parse.netloc.rpartition(".")[2]
        squid_meta_parsed['url_domain'] = root_domain
        squid_meta_parsed['url_scheme'] = url_parse.scheme
        squid_meta_parsed['url_path'] = url_parse.path.rpartition("/")[0]
        squid_meta_parsed['url_file'] = url_parse.path.rpartition("/")[2]
        squid_meta_parsed['http_ver'] = http_ver
        squid_meta_parsed['http_code'] = http_code
    except IndexError as e:
        print("Error: IndexError in parse_squid_meta at cache_key: %s"
              % binascii.b2a_hex(cache_key))
    except ValueError as e:
        print("Error: ValueError in parse_squid_meta at cache_key: %s"
              % binascii.b2a_hex(cache_key))
    except TypeError as e:
        print("Error: TypeError in parse_squid_meta at cache_key: %s"
              % binascii.b2a_hex(cache_key))
    return squid_meta_parsed
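# NOTE (assumption): the fixed byte offsets above match the swap-file metadata
# layout of the Squid 3 cache this script was written against; caches written
# by builds with a different word size or endianness may need other offsets.
# For reference, "48545450" is hex for the ASCII bytes "HTTP" (the start of the
# stored status line) and "000a" marks the end of the stored URL, as noted in
# the inline comments above.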
# decompress a gzipped response payload
def decompress_gzip(response_payload):
    decompressed = ""
    try:
        decompressed = zlib.decompress(response_payload, zlib.MAX_WBITS | 16)
    except zlib.error:
        print("Error: zlib decompression error in decompress_gzip")
    return decompressed
# decompress deflate/zip/PK response payload
def decompress_pk(response_payload):
    decompressed = ""
    try:
        decompressed = zlib.decompress(response_payload, -zlib.MAX_WBITS)
    except zlib.error:
        print("Error: zlib decompression error in decompress_pk")
    return decompressed
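# NOTE: the two wbits arguments above select different containers:
# zlib.MAX_WBITS | 16 tells zlib to expect a gzip header and trailer, while
# -zlib.MAX_WBITS decodes a raw deflate stream with no header, which is the
# form found inside PK/zip entries.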
# parse out references from response_payload (currently HTML only)
def parse_references(response_payload, cache_file_parsed):
    # create a list of references
    references = []
    # create a new BeautifulSoup object from the HTML
    soup = bs(response_payload, 'html.parser')
    # get all hrefs
    for ref in soup.find_all('a'):
        insert = {}
        insert['ref'] = convert_abs_url(ref.get('href'), cache_file_parsed)
        insert['type'] = "href"
        if insert['ref'] is not None:
            references.append(insert)
    for ref in soup.find_all('link'):
        insert = {}
        insert['ref'] = convert_abs_url(ref.get('href'), cache_file_parsed)
        insert['type'] = "href"
        if insert['ref'] is not None:
            references.append(insert)
    # get all srcs
    for ref in soup.find_all('img'):
        insert = {}
        insert['ref'] = convert_abs_url(ref.get('src'), cache_file_parsed)
        insert['type'] = "src"
        if insert['ref'] is not None:
            references.append(insert)
    # return a list containing dicts with refs and their types
    return references
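# NOTE: only <a href>, <link href>, and <img src> references are collected
# above; <script src>, <iframe src>, and CSS url() references are not parsed.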
# standardize urls in refs to absolutes. cfp is cache_file_parsed dict
def convert_abs_url(ref, cfp):
    # don't try to convert a ref that doesn't exist
    if ref is None:
        return None
    # already absolute, probably
    if ref[:4] == "http":
        return ref.encode('ascii', 'ignore')
    # relative (root)
    if ref[:1] == "/":
        return (cfp['url_scheme'] + "://" + cfp['url_host'] + ref).encode('ascii', 'ignore')
    # relative (parent directory traversal ../)
    if ref[:3] == "../":
        trimmed_path = cfp['url_path'].rpartition("/")[0]
        return (cfp['url_scheme'] + "://" + cfp['url_host'] + trimmed_path + ref[2:]).encode('ascii', 'ignore')
    # relative (current directory ./)
    if ref[:2] == "./":
        return (cfp['url_scheme'] + "://" + cfp['url_host'] + cfp['url_path'] + ref[1:]).encode('ascii', 'ignore')
    # relative
    return (cfp['url_scheme'] + "://" + cfp['url_host'] + cfp['url_path'] + "/" + ref).encode('ascii', 'ignore')
# standardize time formats to unix time
def convert_time_strings(cache_file_parsed):
    # list of header values we want to convert
    time_headers = ["Expires", "Date", "Last-Modified"]
    for key in time_headers:
        if key in cache_file_parsed:
            try:
                timestring = cache_file_parsed[key]
                # try to parse datetime string and write as unix timestamp
                cache_file_parsed[key] = int(time.mktime(
                    datetime.strptime(timestring, "%a, %d %b %Y %H:%M:%S GMT").timetuple()))
            except ValueError:
                print("Error: ValueError in convert_time_strings at cache_key: %s"
                      % cache_file_parsed['meta_cache_key'])
    return cache_file_parsed
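# NOTE: only the RFC 1123 form ("Sun, 06 Nov 1994 08:49:37 GMT") is parsed
# here; the older RFC 850 and asctime() date forms that HTTP still allows will
# raise ValueError and be reported rather than converted.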
# print out the configured cache_dir
print("cache_dir: '%s'" % cache_dir)
# get list of all cache_files
cache_files = []
for path, dirs, files in os.walk(cache_dir):
    if path != cache_dir:
        for cache_file in files:
            cache_files.append("%s/%s" % (path, cache_file))
# iterate through all cache_files to extract squid_meta, headers, and payload
output = open('cache-extractor.json', 'w')
for cache_file in cache_files:
    try:
        # just write one JSON line per cache_file for now; manual cleanup later
        output.write(json.dumps(parse_cache_file(cache_file)) + "\n")
    except UnicodeDecodeError:
        print("Error: UnicodeDecodeError at cache_file: %s"
              " (unicode has to make everything difficult)" % cache_file)
output.close()
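# Example run (assumes a copy of the Squid cache under ./data/squid3 and a
# Python 2 interpreter, since this script relies on the md5 and urlparse
# modules and str.encode('hex')):
#
#   python2 cache-extractor.py
#   head -n 1 cache-extractor.json | python2 -m json.tool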