-
Notifications
You must be signed in to change notification settings - Fork 0
/
search.py
44 lines (32 loc) · 1.37 KB
/
search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
from tqdm import tqdm
from image_utils import TextProcessor
from models import Document, Word
# Interactive search prompt.
#
# Each iteration reads a query from stdin, tokenizes it, gathers every
# document attached to a matching Word record, scores each candidate against
# the query with text_processor.calc_sim, and prints the ten best matches.
text_processor = TextProcessor()

while True:
    try:
        query_text = input("Search:")
    except (EOFError, KeyboardInterrupt):
        # input() raises EOFError once stdin is exhausted (piped input,
        # Ctrl-D) and Ctrl-C raises KeyboardInterrupt. The loop has no other
        # exit, so leave it cleanly here instead of dying with a traceback.
        print()
        break

    query_hash = text_processor.hash(query_text)
    query_words = text_processor.tokenize(query_text)
    print("Words:", query_words)

    # The query is wrapped in a Document so it can be passed to calc_sim
    # alongside the stored documents.
    query_document = Document(hash=query_hash,
                              text=query_text,
                              words=query_words)

    # Candidate set: every document referenced by at least one query word.
    # A set is used so a document shared by several words is scored once.
    documents = set()
    for word_text in query_document.words:
        word = Word.objects(text=word_text).first()
        if not word:
            print("Word {word} not found!".format(word=word_text))
            continue
        print("[{word}] - {count} documents found.".format(
            word=word_text, count=len(word.documents)))
        documents.update(word.documents)

    # Score every candidate; only strictly positive scores are kept.
    top_documents = []
    for document in tqdm(documents, total=len(documents)):
        sim_document_query = text_processor.calc_sim(document,
                                                     query_document)
        if sim_document_query > 0:
            top_documents.append([sim_document_query, document.text])

    # Lists compare element by element, so this orders by score (highest
    # first) and falls back to the text when scores tie. Keep the top ten.
    top_documents = sorted(top_documents, reverse=True)[:10]
    for index, document in enumerate(top_documents):
        # Long texts are truncated to their first 1000 characters.
        print(index, ":", document[1][:1000], "Similarity: ", document[0],
              "\n")