-
Notifications
You must be signed in to change notification settings - Fork 0
/
run_htr.py
95 lines (72 loc) · 3.2 KB
/
run_htr.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import json
import logging
import os
from typing import Any
from xml.etree import ElementTree
from tqdm import tqdm
from TkbsApiClient import TranskribusClient
from TkbsDocument import Document
from utilities import add_transkribus_args, find_existing, gather_document_folders, init_tkbs_connection, load_document, save_job_indication, setup_logging, setup_parser
def get_args():
    """Parse the command-line arguments for this script.

    The parser is created by ``setup_parser()`` and then handed to
    ``add_transkribus_args()`` before the command line is parsed.

    Returns:
        The namespace produced by ``parse_args()``.
    """
    cli_parser = setup_parser()
    add_transkribus_args(cli_parser)
    return cli_parser.parse_args()
def htr_in_doc(doc: dict):
    """Report whether a document dict already has transcribed lines.

    Reads ``doc['md']['nrOfTranscribedLines']`` and returns True when that
    count is greater than zero.
    """
    metadata = doc['md']
    transcribed_lines = metadata['nrOfTranscribedLines']
    return transcribed_lines > 0
def run_htr(tkbs: TranskribusClient, tkbs_htr_model_id: int, collection_id: int, tkbs_doc_id: int, tkbs_doc: dict) -> int:
    """Start an HTR decoding job covering every page of a document.

    Args:
        tkbs: Client whose ``htrRnnDecode`` method issues the request.
        tkbs_htr_model_id: Id of the HTR model passed to ``htrRnnDecode``.
        collection_id: Id of the collection passed to ``htrRnnDecode``.
        tkbs_doc_id: Id of the document to run HTR on.
        tkbs_doc: Document dict; only the ``pageId`` of each entry in
            ``tkbs_doc['pageList']['pages']`` is read.

    Returns:
        The job id, parsed as an integer from the client's response.

    Raises:
        ValueError: If the response is empty or cannot be converted to an
            integer. The underlying error is attached as ``__cause__``.
    """
    # Only the page ids are sent; any other per-page fields are dropped.
    page_refs = [{'pageId': page['pageId']} for page in tkbs_doc['pageList']['pages']]
    json_dict = {
        "docId": tkbs_doc_id,
        "pageList": {"pages": page_refs},
    }
    response = tkbs.htrRnnDecode(
        collection_id,
        tkbs_htr_model_id,
        "trainDataLanguageModel",
        tkbs_doc_id,
        json.dumps(json_dict),
        bDictTemp=False,
    )
    logging.debug(response)

    try:
        # Any falsy response (None, empty string, ...) is rejected explicitly
        # rather than through a sentinel value that int() cannot parse.
        if not response:
            raise ValueError('empty response')
        jobid = int(response)
    except (TypeError, ValueError) as err:
        # Catch only conversion failures so KeyboardInterrupt / SystemExit
        # propagate untouched, and keep the original error as the cause.
        raise ValueError(f"Can't parse job id '{response}'") from err
    return jobid
def main():
    """Issue an HTR job for each local document folder matched in the collection.

    Document folders are gathered from ``args.base`` and each one is matched
    against the documents listed for ``args.tkbs_collection_id``:

    * folders with no matching Transkribus document are counted as missing;
    * documents that already have transcribed lines, or whose folder already
      contains a ``transkribus_output`` directory, are counted as skipped
      unless ``args.overwrite`` is set;
    * for every other document an HTR job is started and its id is recorded
      in the folder via ``save_job_indication``.

    A one-line summary of the three counters is printed at the end.
    """
    args = get_args()
    setup_logging(args)
    tkbs = init_tkbs_connection(args)
    collection_id = args.tkbs_collection_id

    print(f'Running HTR on all documents from Transkribus collection {collection_id}')
    logging.info(f'Running HTR on all documents from Transkribus collection {collection_id}')

    logging.debug('Loading documents from Transkribus')
    existing_docs = tkbs.listDocsByCollectionId(collection_id)

    jobs_issued = skipped = missing = 0
    # Materialised as a list, which gives tqdm a length for its progress bar.
    folders = list(gather_document_folders(args.base))
    for folder in tqdm(folders):
        doc = load_document(folder)
        existing = find_existing(doc, existing_docs)
        if not existing:
            logging.warning(f"Can't locate document for {folder}, skipping")
            missing += 1
            continue

        tkbs_doc_id = int(existing['docId'])
        logging.debug(f'Loading document {tkbs_doc_id} from Transkribus')
        tkbs_doc = tkbs.getDocById(collection_id, tkbs_doc_id)

        # Do not redo work unless the caller explicitly asked to overwrite.
        if htr_in_doc(tkbs_doc) and not args.overwrite:
            logging.info(f'Skipping {doc.title}, it has already been transcribed')
            skipped += 1
            continue

        output_folder = os.path.join(folder, 'transkribus_output')
        if os.path.exists(output_folder) and not args.overwrite:
            logging.info(f'Skipping {doc.title}, it already has a transkribus output')
            skipped += 1
            continue

        # Run HTR and leave a job marker in the folder.
        logging.info(f'Starting HTR on document {doc.title}')
        job_id = run_htr(tkbs, args.tkbs_htr_model_id, collection_id, tkbs_doc_id, tkbs_doc)
        save_job_indication(folder, job_id)
        jobs_issued += 1

    print(f'Done, {jobs_issued} jobs issued, {missing} documents missing, {skipped} documents skipped')
# Script entry point: run the batch only when executed directly, not on import.
if __name__ == "__main__":
    main()