-
Notifications
You must be signed in to change notification settings - Fork 0
/
cross_reference.py
137 lines (118 loc) · 4.59 KB
/
cross_reference.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
from __future__ import print_function

import json
import os
import re
import urllib
import urllib.parse
import urllib.request

from dotenv import load_dotenv
from serpapi import GoogleSearch
# Read the API credentials: load the local ".env" file first, then look the
# two keys up in the environment (each is None when its variable is unset).
load_dotenv(".env")
serp_api_key = os.environ.get("SERPAPI_API_KEY")
google_knowledge_graph_api_key = os.environ.get("GOOGLE_KG_API_KEY")
def with_serp(query_string):
    """
    Run a Google search through the Serp API and judge the results.

    The API key is taken from the module-level `serp_api_key`.
    Additional help information can be found on `https://serpapi.com/`

    input:
        query_string : type = str, the Google search query; it is lower-cased
                       before being sent to the API
    output:
        the value returned by process_results when given the original
        (not lower-cased) query_string and the raw result dictionary

    WARNING: limited queries (100 - 10000)
    """
    # The parameter dict is passed inline, so no local name in this function
    # keeps a reference to the API key.
    search = GoogleSearch({
        "q": query_string.lower(),
        "api_key": serp_api_key
    })
    return process_results(query_string, search.get_dict())  # check if results are good
def process_results(query_string, results):
    """
    Decide whether a Google search result set supports the queried candidate.

    A web result supports the query when every whitespace-separated query
    word occurs, case-insensitively and as a substring, in that result's
    keywords, title, or snippet. Knowledge graph results are not considered.

    input:
        query_string : type = str, the google search query which provided results
        results      : type = dict, the results of the Google Search; the
                       'organic_results' list and each result's
                       'about_this_result'/'keywords', 'title' and 'snippet'
                       entries are all optional
    output:
        boolean : True if this candidate is acceptable and exists
                  False if not (including an empty/blank query or a response
                  without organic results)
        link    : type = str, the search result that trusts this candidate
                  (also printed), or None when no result qualifies
    """
    # Lower-case the query so it is compared like-for-like with the
    # lower-cased result text below.
    words = query_string.lower().split()
    if not words:
        # An empty query would otherwise be vacuously matched by any result.
        return False, None

    for web_result in results.get('organic_results') or []:
        keywords = (web_result.get('about_this_result') or {}).get('keywords') or []
        # Query words contain no whitespace, so joining the fields with a
        # newline cannot create a match that spans two fields.
        haystack = '\n'.join((
            ' '.join(keywords),
            web_result.get('title') or '',
            web_result.get('snippet') or '',
        )).lower()
        if all(word in haystack for word in words):
            link = web_result.get('link')
            print(link)
            return True, link
    return False, None
def clean_string(s):
    """
    Strip a fixed set of punctuation characters from a string.

    Only the characters " ! # $ % * are removed; all other characters
    (including other punctuation and whitespace) are left as they are.

    input:
        s : type = str, the string that needs to be cleaned
    output:
        str : a copy of s without the characters listed above
    """
    bad_chars = re.compile('["!#$%*]')
    return bad_chars.sub('', s)
def gkg_query(query_string, threshold=1, print_results=False):
    """
    Use Google's Knowledge Graph Search API call and analyze the results to check
    if the output is reasonable for our search query

    A result is accepted when its score is at least 1 and its detailed
    description contains at least `len(query words) - threshold` of the
    cleaned, lower-cased query words as whole whitespace-separated tokens.
    Results without a detailed description are skipped.

    input:
        query_string  : type = str, some google search query you want
        threshold     : type = int, how many query words may be missing from
                        a result's detailed description for it to still be
                        accepted, default = 1 -- only one missing word will
                        be tolerated (so a one-word query is accepted by any
                        scored result that has a description)
        print_results : type = bool, print the request URL, the raw response
                        and the accepting result for debugging
    output:
        boolean : True if some result supports the query, False if not
                  (including a query with no words left after cleaning)
        link    : type = str, the URL of the accepting result's detailed
                  description, falling back to the entity URL; None when the
                  response provides neither or when nothing was accepted
    """
    cleaned_query = clean_string(query_string)
    # Computed once here instead of once per result and per word.
    query_words = cleaned_query.lower().split()
    if not query_words:
        # With no words to check, every result would pass the threshold test.
        return False, None

    params = {
        'query': query_string,
        'limit': 10,
        'indent': True,
        'key': google_knowledge_graph_api_key
    }

    # query KG
    url = 'https://kgsearch.googleapis.com/v1/entities:search' + '?' + urllib.parse.urlencode(params)
    if print_results:
        # NOTE(review): this URL embeds the API key; avoid print_results
        # wherever the output is logged or shared.
        print(url, end="\n\n")
    # The context manager closes the HTTP response even if parsing fails.
    with urllib.request.urlopen(url) as http_response:
        response = json.loads(http_response.read())

    # process results
    if print_results:
        print(response)

    required_matches = len(query_words) - threshold
    for result in response.get('itemListElement') or []:
        if result.get('resultScore', 0) < 1:
            continue
        try:
            entity = result['result']
            detailed = entity['detailedDescription']
            description = detailed['articleBody']
        except (KeyError, TypeError):
            # No description on this entity: skip it rather than abandoning
            # the remaining results.
            continue

        description_words = set(clean_string(description.lower()).split())
        word_count = sum(word in description_words for word in query_words)
        if word_count >= required_matches:
            if print_results:
                print(f"Query of `{cleaned_query}` found TRUE by the following search result:\n")
                print(result)
            # Always return a 2-tuple so success and failure have the same shape.
            return True, detailed.get('url') or entity.get('url')
    return False, None
if __name__ == '__main__':
    # Manual smoke test: look one sample query up and report which backend,
    # if any, accepted it.
    query_string = 'vscode ide'
    print('Query:', query_string, end='\n\n\n\n')

    res1 = gkg_query(query_string, threshold=1, print_results=True)
    # The Serp API lookup is disabled here (presumably to save its limited
    # quota); the placeholder keeps the same tuple shape so that res2[0]
    # below is valid.
    res2 = (False, None)  # with_serp(query_string)

    # Both lookups return tuples, and a non-empty tuple is always truthy,
    # so test the boolean in position 0 rather than the tuple itself.
    if res1[0]:
        print("SUCCESS 1")
    elif res2[0]:
        print("SUCCESS 2")
    else:
        print("FAILURE")