domain_analysis.py
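"""Domain analysis for news articles.

Checks a URL's hostname against CSV lists of non-credible
(open_sources_list.csv) and credible (credible.csv) news sources, looks up
the domain's WHOIS creation year, and fetches related Guardian articles for
the parsed article's keywords via the Guardian content API.
"""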
import csv, traceback, requests
import cPickle as pickle
from urlparse import urlparse
import sys
import whois
from newspaper import Article
GUARDIAN_API_KEY = '091d775d-0f0e-4924-b2e9-61fd1900f1c7'
def isInDictionary(d, url):
    list_sites = d.keys()
    # see if the hostname is an exact match for a site in the open sources list
    if url.hostname in list_sites:
        return d[url.hostname]
    else:
        # otherwise loop through all keys and see if one matches part of the hostname
        for key in list_sites:
            if key in url.hostname:
                return d[key]
    return -1
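# Illustrative example (the hostname is made up, not taken from the real CSVs):
#   isInDictionary({'example-fake-news.com': 'unreliable'},
#                  urlparse('http://example-fake-news.com/some-story'))
# returns 'unreliable' via the exact-hostname match; a hostname that matches
# no key falls through to -1.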
def isDomainReputable(url):
    non_credible_news = {}
    credible_news = {}
    # open the list of non-credible news sources
    with open('open_sources_list.csv', 'r') as csvfile:
        my_reader = csv.reader(csvfile, delimiter=',', quotechar='|')
        # skip the header line of the file
        my_reader.next()
        for row in my_reader:
            non_credible_news[row[0]] = row[1]
    parsed_uri = urlparse(url)
    value = isInDictionary(non_credible_news, parsed_uri)
    if value != -1:
        return value
    # fall back to the list of credible news sources
    with open('credible.csv', 'r') as csvfile:
        my_reader = csv.reader(csvfile, delimiter=',', quotechar='|')
        # skip the header line of the file
        my_reader.next()
        for row in my_reader:
            site = row[0]
            if site.startswith('"') and site.endswith('"'):
                site = site[1:-1]
            credible_news[site] = "credible"
    value = isInDictionary(credible_news, parsed_uri)
    if value != -1:
        return value
    return "Site Not Found in our data list!"
# returns the WHOIS creation year for the given hostname
def getWhoisCreationDate(hostname):
    w = whois.whois(hostname)
    creation_date = w['creation_date']
    # print creation_date
    # print hostname
    if hostname == 'independent.co.uk':
        # hard-coded creation year for this domain
        yr = 1996
        return yr
    yr = 0
    try:
        # python-whois may return a list of dates; keep the most recent year
        for obj in creation_date:
            if yr < obj.year:
                yr = obj.year
    except TypeError:
        # a single datetime is not iterable, so just take its year
        yr = creation_date.year
    return yr
def calcAvgCreationDateAge(list_domains):
    list_dates = []
    total = 0
    for dom in list_domains:
        try:
            yr = getWhoisCreationDate(dom)
            print yr, dom
            total += yr
            list_dates.append(yr)
        except AttributeError:
            print "Failed on dom " + dom
            traceback.print_exc()
    return float(total) / float(len(list_dates))
    # pickle.dump(list_dates,'creation_dates.cp')
    # print list_dates
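# Note: the hard-coded average printed in __main__ below was presumably
# precomputed with something like
#   calcAvgCreationDateAge(getListOfNewsDomains('credible.csv'))
# since running live WHOIS lookups over the whole credible list can be slow.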
def getListOfNewsDomains(file_path):
    list_dom = []
    with open(file_path, 'r') as csvfile:
        my_reader = csv.reader(csvfile, delimiter=',', quotechar='|')
        # skip the header line of the file
        my_reader.next()
        for row in my_reader:
            site = row[0]
            if site.startswith('"') and site.endswith('"'):
                site = site[1:-1]
            list_dom.append(site)
    return list_dom
# TODO: find other works by the article's author(s)
def getAuthorsOtherWorks(authors):
    return False
def getGuardianContent(keywords):
    # join the keywords with URL-encoded spaces to build the search query
    content_string = ''
    for keyword in keywords:
        content_string = content_string + keyword + '%20'
    r = requests.get('https://content.guardianapis.com/search?q=' + content_string + '&api-key=' + GUARDIAN_API_KEY)
    results = r.json()
    list_results = results['response']['results']
    # collect the URL of each matching Guardian article
    list_guardian_articles = []
    for json_d in list_results:
        list_guardian_articles.append(json_d['webUrl'])
    return list_guardian_articles
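# Illustrative request: for keywords ['syria', 'strike'] the URL built above is
#   https://content.guardianapis.com/search?q=syria%20strike%20&api-key=<key>
# (the loop leaves a trailing %20 on the query string).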
if __name__ == "__main__":
if len(sys.argv)<2:
url1 = "http://breitbart.com/big-government/2017/04/08/sen-mcconnell-supreme-court-vacancy-key-president-trumps-win/"
url2 = "https://www.nytimes.com/2017/04/08/world/middleeast/us-strike-on-syria-brings-fleeting-hope-to-those-caught-in-brutal-conflict.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=first-column-region®ion=top-news&WT.nav=top-news"
print "Beginning Domain Analysis... "
print urlparse(url1).hostname + " is considered "+isDomainReputable(url1)
print urlparse(url2).hostname + " is considered "+isDomainReputable(url2)
else:
url= sys.argv[1]
news_hostname = urlparse(url).hostname
print news_hostname+ " is considered "+isDomainReputable(url)
#print "News Domain was created on "+str( getWhoisInformation(news_hostname))
list_credible_domains = getListOfNewsDomains('credible.csv')
print('Avg Domain Creation Date of Credible News Sites is 1994.95 ')
list_non_credible_domains = getListOfNewsDomains('open_sources_list.csv')
print(news_hostname+' was first registered on '+str(getWhoisCreationDate(news_hostname)))
#use the newspaper framework to download the article and find data
article = Article(url)
article.download()
article.parse()
#Use Google API to find top links about the author
list_otherwork=getAuthorsOtherWorks(article.authors)
#num_quotes = 0
#for c in article.text:
# print c
# if c == '"' or c=='\'':
# num_quotes+=1
#print "Author: "+str(article.authors)
#print article.title
#print "LENGTH OF ARTICLE "+str(len(article.text)) +" characters"
article.nlp()
#print "Article Keywords: "+str(article.keywords)
#print article.top_image
#print "Article Summary: "+article.summary
#GET GUARDIAN CONTENT
list_guardian_articles = getGuardianContent(article.keywords)
print list_guardian_articles
#print "Number of Quotation Marks in "+ article.title+ " is "+ str(num_quotes)