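# domainExtractor.py
# Extracts domain names from local files and/or a web page and appends any
# new ones to final.<target>.txt, logging each new find under logs/.
#
# Example invocations (the input file names here are illustrative only):
#   python domainExtractor.py --file content.txt --target uber.com
#   python domainExtractor.py --url https://example.com/page --target all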
import os, re, sys, argparse, urllib.parse, logging
import requests
parser = argparse.ArgumentParser(
    description="This script extracts domains from the files or URL you specify and adds them to a final file"
)
parser.add_argument('--file', action="store", default=None, dest='inputFile',
                    help="Specify the file(s) to extract domains from; comma-separate multiple files")
parser.add_argument('--url', action="store", default=None, dest='url',
                    help="Specify the web page to extract domains from. One at a time for now")
parser.add_argument('--target', action="store", default='all', dest='target',
                    help="Specify the target domain you'd like to find and extract, e.g. uber.com")
parser.add_argument('--verbose', action="store_true", default=False, dest='verbose',
                    help="Enable slightly more verbose console output")
args = parser.parse_args()
# Show the help text and exit when the script is run with no arguments
if len(sys.argv) == 1:
    parser.print_help()
    print()
    sys.exit()
### Set the logger up
if not os.path.exists('logs'):
    os.makedirs('logs')
logfileName = "logs/newdomains.{}.log".format(args.target)
logging.basicConfig(filename=logfileName, filemode='a',
                    format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
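# Each new domain is appended to the log like so (timestamp illustrative):
#   2023-01-01 12:00:00,000 - INFO - New domain found: sub.example.com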
outputFile = "final.{}.txt".format(args.target)
def extractDomains(args, inputFile, rawData):
    domains = []
    if not args.target:
        print("No target specified, defaulting to finding 'all' domains")
    for i in rawData:
        # Unquote twice to also catch double-URL-encoded values, then match
        # hostname-shaped strings: dot-separated labels ending in a 2-6 letter TLD
        matches = re.findall(r'(?:[a-zA-Z0-9](?:[a-zA-Z0-9\-]{,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,6}',
                             urllib.parse.unquote(urllib.parse.unquote(i)))
        if args.target.lower() != 'all':
            # Keep only matches that contain the requested target
            for j in matches:
                if j.find(args.target.lower()) != -1:
                    domains.append(j)
        else:
            # No specific target: keep matches containing a common TLD
            for j in matches:
                if any(tld in j for tld in ('.com', '.net', '.org', '.tv', '.io')):
                    domains.append(j)
    print("File: {} yielded {} possible domains from {} lines...".format(inputFile, len(domains), len(rawData)))
    return domains
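# For example (hypothetical input), a line containing "https%3A%2F%2Fapi.uber.com%2Fv1"
# double-unquotes to "https://api.uber.com/v1" and yields "api.uber.com"
# when run with --target uber.com.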
results = []
# If files are specified, check them
if args.inputFile:
    fileList = args.inputFile.split(',')
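    # --file accepts a comma-separated list, e.g. --file burp.txt,js.txt (names illustrative)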
    for inputFile in fileList:
        try:
            with open(inputFile, 'r') as f:
                rawData = f.read().splitlines()
        except UnicodeDecodeError:
            # Fall back to a more permissive encoding for non-UTF-8 files
            with open(inputFile, 'r', encoding="ISO-8859-1") as f:
                rawData = f.read().splitlines()
        results += extractDomains(args, inputFile, rawData)
# If a URL is specified, pull that
if args.url:
    # Browser-like User-Agent so basic bot filtering doesn't block the request
    headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0"}
    # timeout keeps a hung server from stalling the whole run
    rawData = requests.get(args.url, headers=headers, timeout=30)
    rawData = rawData.text.split('\n')
    results += extractDomains(args, args.url, rawData)
# sort and dedupe our results
finalDomains = sorted(set(results))
# read all the domains we already have.
try:
    with open(outputFile, 'r') as out:
        oldDomains = out.read().splitlines()
# If no final file, create one
except FileNotFoundError:
    print("Output file not found. Creating one...")
    with open(outputFile, 'w') as out:
        for i in finalDomains:
            out.write("{}\n".format(i))
    print("{} domains written to output file {}".format(len(finalDomains), outputFile))
# Loop through fresh domains. If we don't already have one, add it to the final file, notify us, and log it.
else:
    newDomains = []
    with open(outputFile, 'a') as out:
        for i in finalDomains:
            if i not in oldDomains:
                newDomains.append(i)
                out.write("{}\n".format(i))
    if newDomains:
        print("{} new domains were found and added to {}".format(len(newDomains), outputFile))
        for i in newDomains:
            logger.info("New domain found: {}".format(i))
    else:
        print("No new domains found.")