-
Notifications
You must be signed in to change notification settings - Fork 0
/
search.py
171 lines (144 loc) · 6.75 KB
/
search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
import logging
import sys
import os
import subprocess
from bs4 import BeautifulSoup
from thefuzz import fuzz
import math
from header import *
def dumpMobi(mobiPath, force=False) -> str:
    """Dump a mobi file to ./dump/<filename> via kindle-unpack.

    Args:
        mobiPath: Path to the .mobi file; the basename must carry exactly
            one extension (e.g. "book.mobi").
        force: When True, wipe and re-create a pre-existing dump for this
            book instead of keeping it.

    Returns:
        Path of the dump subdirectory for this book.

    Raises:
        ValueError: If mobiPath has no extension or multiple extensions.
    """
    workingDirectory: str = os.getcwd()
    dumpRoot: str = f"{workingDirectory}/dump"
    logging.info(f"Dumping mobi file to {dumpRoot}")
    if "dump" not in os.listdir(workingDirectory):
        logging.info("Making mobi dump directory")
        os.mkdir('dump')
    fileSplit: list[str] = mobiPath.split(".")
    if fileSplit[0] == "":
        # Relative paths such as "./book.mobi" yield a leading empty
        # segment which previously broke file extension checking.
        fileSplit.pop(0)
    if len(fileSplit) != 2:
        # Ensure a valid filename: exactly one extension.
        # BUG FIX: logging.exception is only meaningful inside an except
        # block; use logging.error here and raise with a message.
        logging.error(f"Did not receive path to mobiPath with a valid filename (no or multiple extensions), instead received \"{mobiPath}\"")
        raise ValueError(f"Invalid mobi filename (no or multiple extensions): {mobiPath!r}")
    filename: str = os.path.basename(mobiPath).split(".")[0]
    targetDirectory: str = f"{dumpRoot}/{filename}"
    if filename not in os.listdir(dumpRoot):
        logging.info(f"Creating subdirectory dump/{filename}, dumping.")
        # NOTE(review): subprocess.run result is not checked; an unpack
        # failure is silent here — confirm whether check=True is wanted.
        subprocess.run(["python", "kindle-unpack/kindleunpack.py", mobiPath, targetDirectory])
    elif force:
        logging.info("Found pre-existing mobi dump, overwriting.")
        # BUG FIX: original deleted from the literal path "<cwd>/filename"
        # (the variable was never interpolated); clear the actual dump
        # subdirectory for this book instead.
        for toDelete in os.listdir(targetDirectory):
            os.remove(os.path.join(targetDirectory, toDelete))
        subprocess.run(["python", "kindle-unpack/kindleunpack.py", mobiPath, targetDirectory])
    else:
        logging.info("Found pre-existing mobi dump, keeping it.")
    return targetDirectory
def getContentFile(inFolder: str) -> str:
    """Return the html/xhtml content file inside a dumped mobi folder.

    A mobi dump *should* contain exactly one html file; the list handling
    is a holdover from epub parsing. If several are found, the first is kept.

    Args:
        inFolder: Path of the dumped mobi directory to scan recursively.

    Returns:
        Path of the first html/xhtml file found.

    Raises:
        FileNotFoundError: If no html/xhtml file exists under inFolder.
    """
    logging.info("Getting content file")
    fileList: list[str] = []
    # Search recursively for html/xhtml files and save their paths.
    for root, dirs, files in os.walk(inFolder):
        for file in files:
            # BUG FIX: the original used file.split(".")[1], which misreads
            # multi-dot names like "a.b.html"; splitext takes the real
            # (last) extension and handles extension-less names too.
            extension: str = os.path.splitext(file)[1].lower()
            if extension in (".html", ".xhtml"):
                fileList.append(os.path.join(root, file))
    if not fileList:
        # BUG FIX: logging.CRITICAL is an int level constant, not a
        # function — calling it raised TypeError; use logging.critical.
        logging.critical(f"No html file found in dump directory {inFolder}")
        raise FileNotFoundError(f"No html file found in dump directory {inFolder}")
    # BUG FIX: contentFile was unbound in the multiple-files branch,
    # causing a NameError both in the debug log and at return.
    contentFile: str = fileList[0]
    if len(fileList) > 1:
        logging.debug(f"{len(fileList)} content files found, expected 1. Keeping first file {contentFile}")
    logging.info(f"Content file {contentFile} found in folder {inFolder}")
    return contentFile
def findMatch(inputFile: str, searchText: str, confidenceLevel: int = 80, poorMatchMargin: int = 10, promptPoorMatches: bool = True) -> Match:
    """Parse the specified xhtml/html file and search for the ebook line
    which matches the transcribed text.

    Args:
        inputFile: Path of the html/xhtml content file to search.
        searchText: Transcribed text to match against the ebook.
        confidenceLevel: Minimum fuzzy-match score (0-100) for a full match.
        poorMatchMargin: Scores within this margin below confidenceLevel are
            kept as fall-back "poor" matches.
        promptPoorMatches: When True, interactively ask the user to pick
            among poor matches; when False, take the best poor match.

    Returns:
        The best Match found, or None when nothing acceptable was found.
    """
    matches: list[Match] = []
    poorMatches: list[Match] = []
    # BUG FIX: result was previously unbound when neither a full nor a poor
    # match existed, raising UnboundLocalError at the final return.
    result = None
    with open(inputFile, "r") as html:
        soup = BeautifulSoup(html, "xml", from_encoding="utf-8")  # init beautiful soup
    logging.debug(f"Searching {inputFile} for text: {searchText} with confidence level {confidenceLevel}")
    # Iterates over entire html text and does a string comparison for every tag... needs optimization
    for tag in soup.body.find_all(True):
        candidate: Match = compareText(transcribedText=searchText, ebookText=tag.text)
        # NOTE(review): boundary unified to >=; the original mixed a strict
        # ">" gate with an ">=" elif, dropping exact-boundary scores.
        if candidate.confidenceLevel >= (confidenceLevel - poorMatchMargin):
            candidate.file = inputFile
            if candidate.confidenceLevel >= confidenceLevel:
                matches.append(candidate)
            else:
                # fall back list if no matches meet confidence level
                poorMatches.append(candidate)
            logging.info(f"Found matching string with {candidate.confidenceLevel}% accuracy in {inputFile}:\n {searchText}matches\n {tag.text}")
    if matches:
        # Select the highest-confidence full match.
        result = max(matches, key=lambda m: m.confidenceLevel)
    else:
        # No full matches found; if searchUntilMatch is True the caller may
        # execute again with a lower confidence.
        logging.info(f"No matching text found for string {searchText}")
        print(f"No matches found for transcribed string:\n-{searchText}")
        if len(poorMatches) >= 1:
            if promptPoorMatches:
                print("="*90)
                print(f"However poorer match(es) were found with at least {confidenceLevel-poorMatchMargin} certainty")
                print(f"If one of the text(s) below is the correct match for the transcribed text [1] please enter its number, or 0 if none.")
                print(f"[0] : {searchText}")
                for index, match in enumerate(poorMatches):
                    print(f"[{index+1}] : {match.text}")
                inp = input("Enter correct string number, or 0 to skip: ")
                # BUG FIX: input() returns a str, so "inp not in range(...)"
                # was always True — the user's selection was silently
                # discarded, and poorMatches[inp-1] would have raised
                # TypeError anyway. Convert to int, treating junk as skip.
                try:
                    choice = int(inp)
                except ValueError:
                    choice = 0
                if choice in range(1, len(poorMatches) + 1):
                    logging.info(f"User selected correct matching string")
                    result = poorMatches[choice - 1]
                else:
                    result = None
            else:
                result = max(poorMatches, key=lambda m: m.confidenceLevel)  # select highest confidence poor match
        else:
            print(f"Nor any matches within {poorMatchMargin}% of {confidenceLevel}.")
    return result
def compareText(transcribedText: str, ebookText: str) -> Match:
    """Normalize two strings as far as possible, then fuzzy-match them.

    The ebook line is lower-cased (the transcription will be all lowercase,
    so this improves accuracy) and trimmed to the transcription's word count
    so lines of very different length still compare fairly.

    Args:
        transcribedText: Text produced by the transcription step.
        ebookText: Candidate line of text from the ebook.

    Returns:
        Match holding the fuzz.ratio confidence (0-100) and the ORIGINAL
        (untrimmed, original-case) ebook text; file and location are None.
    """
    sLine: str = ebookText
    line: str = sLine.lower()
    # Split by whitespace to count words.
    targetWords: list[str] = transcribedText.split()
    lineWords: list[str] = line.split()
    # Trim the ebook text to the transcription's word count for a better
    # comparison. IDIOM FIX: the original built this list with a manual
    # append loop guarded by a length check; a slice is equivalent (slicing
    # past the end simply returns the whole list) and clearer.
    lineText: str = " ".join(lineWords[:len(targetWords)])
    confidence: int = fuzz.ratio(transcribedText, lineText)
    return Match(confidenceLevel=confidence, text=sLine, file=None, location=None)
def findBytes(filePath: str, searchText: str) -> int:
    """Return the byte offset of searchText within the file at filePath.

    The file is read as raw bytes and searchText is UTF-8 encoded before
    searching; follows bytes.find semantics, so -1 means "not present".
    """
    needle: bytes = searchText.encode("utf-8")
    with open(filePath, "rb") as handle:
        haystack: bytes = handle.read()
    position: int = haystack.find(needle)
    logging.info(f"Found byte position: {position} of searchText {searchText}")
    return position
def calculateLocation(numBytes) -> int:
    """Convert a byte offset into an approximate Kindle location number.

    The formula bytes*2/300 advances one location per 150 bytes, and
    numbering starts at 1 (hence the +1 before flooring).
    """
    scaled = (numBytes * 2) / 300 + 1
    location: int = int(math.floor(scaled))
    logging.info(f"Found kindle location: {location} for byte position: {numBytes}")
    return location
if __name__ == '__main__':
    # Module is import-only; no standalone behavior when run as a script.
    pass