-
Notifications
You must be signed in to change notification settings - Fork 3
/
tagger.py
141 lines (119 loc) · 5.04 KB
/
tagger.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import sys, os, pathlib, json
from os import listdir
from os.path import isfile, join
import re
import time
# Module-level registry filled in by setup_dicts(): maps a category name
# (taken from a dictionary file name) to that category's list of phrases,
# sorted longest first. tag_article() reads it when building its patterns.
dicts = {}
def path_to_paper_id(path):
    '''
    Derives the paper id from an article path: takes whatever follows the
    last "/" and drops its final five characters (the ".json" suffix of an
    article file name).
    '''
    file_name = path.rsplit("/", 1)[-1]
    return file_name[:-5]
def read_article(path):
    '''
    Reads one article JSON file and returns its text sections in document
    order as a list of (text, section_type) tuples: first the title
    ("title"), then every abstract paragraph ("abstract"), then every body
    paragraph ("body_text").

    path: path of the article JSON file.

    Raises KeyError if "metadata"/"title" or "body_text" is missing. A
    missing or null "abstract" is treated as an article without abstract.
    '''
    # JSON text is UTF-8; do not rely on the platform's default encoding.
    with open(path, encoding="utf-8") as f:
        d = json.load(f)

    ret = [(d["metadata"]["title"], "title")]
    for t in d.get("abstract") or []:
        ret.append((t["text"], "abstract"))
    for t in d["body_text"]:
        ret.append((t["text"], "body_text"))
    return ret
def read_meta(paperid, metaf):
    '''
    Looks up a paper in the metadata file and returns
    [cord_uid, sourcedb, sourceid, obj] for the first line that contains
    paperid anywhere in its text. If no line matches, returns [0,0,0,0].

    paperid: id string to search for (plain substring test per line).
    metaf:   path of the comma-separated metadata file.

    The line is split on every "," (quoted fields are not treated
    specially) and the values are taken from columns 0, 2, 5 and the last
    column. The last column is returned as read, so it still carries the
    line's trailing newline.

    Raises IndexError if the matching line has fewer than six fields.
    '''
    # The context manager closes the file on both the early return and the
    # not-found path; previously the handle was never closed.
    with open(metaf, "r") as metafile:
        for line in metafile:
            if paperid in line:
                metaline = line.split(",")
                cord_uid = metaline[0]
                sourcedb = metaline[2]
                sourceid = metaline[5]
                obj = metaline[-1]
                return [cord_uid, sourcedb, sourceid, obj]
    return [0, 0, 0, 0]
def setup_dicts(dicts_folder_path):
    '''
    Loads every dictionary (class) file found in dicts_folder_path into the
    module-level variable dicts.

    Each file contributes one entry: the key is the file name without its
    extension, the value is the list of phrases in the file (one per line,
    surrounding whitespace stripped), sorted on length with longest first.

    Blank lines are skipped: an empty phrase would turn into an empty regex
    alternative in tag_article and match at every position of the text.
    Sub-directories of the folder are ignored.
    '''
    for file_name in os.listdir(dicts_folder_path):
        file_path = os.path.join(dicts_folder_path, file_name)
        if not os.path.isfile(file_path):
            continue
        cat = os.path.splitext(file_name)[0]
        # Read inside a context manager so the handle is closed per file.
        with open(file_path) as f:
            stripped_lines = [line.strip() for line in f]
        phrase_list = [phrase for phrase in stripped_lines if phrase]
        phrase_list.sort(key=len, reverse=True)
        dicts[cat] = phrase_list
def tag_article(article_path, metaf):
    '''
    Tags one complete article against every dictionary in dicts.

    article_path: path of the article JSON file (read with read_article).
    metaf:        metadata file path; not needed for tagging, kept so that
                  existing callers do not have to change.

    Returns (denotated_sections, article). article is the list produced by
    read_article; denotated_sections holds, for each section in the same
    order, a list of denotation dicts of the form
        {"id": category, "span": {"begin": b, "end": e}, "obj": category}
    where b/e are character offsets into the lowercased section text with
    every "-" replaced by a space.

    Tagging is done section for section. Matches from two different
    dictionaries may overlap, but matches from the same dictionary may not.
    Longer matches are prioritized.
    '''
    article = read_article(article_path)

    # Build one pattern per category once, instead of once per section.
    # The phrases are already sorted longest first and a regex alternation
    # takes the first alternative that matches at a position, so this order
    # is what gives longer phrases priority. Phrases are escaped so that
    # characters such as "(", "+" or "." are matched literally. Empty
    # phrases and empty dictionaries are left out because an empty
    # alternative matches at every position.
    patterns = []
    for cat, phrases in dicts.items():
        alternatives = [re.escape(phrase) for phrase in phrases if phrase]
        if alternatives:
            patterns.append((cat, re.compile("|".join(alternatives))))

    denotated_sections = []
    for section in article:
        text = section[0].lower().replace('-', ' ')
        denotations = []
        for cat, pattern in patterns:
            # finditer yields non-overlapping matches, which is why two
            # matches from the same dictionary never overlap.
            for m in pattern.finditer(text):
                denotations.append({
                    "id": cat,
                    "span": {"begin": m.start(), "end": m.end()},
                    "obj": cat,
                })
        denotated_sections.append(denotations)
    return denotated_sections, article
def generate_jsons(denotated_sections, article, path, metaf):
    '''
    Writes the tagging result of one article as json files in the current
    working directory, one file per section: one for the title, one for
    each abstract paragraph and one for each paragraph of the body text.

    denotated_sections: per-section denotation lists from tag_article.
    article:            list of (text, section_type) from read_article.
    path:               article path or file name, used to look up metadata.
    metaf:              path of the metadata file.

    File names follow cord_uid-div_id-sectiontype.json
    eg: 31996494-0-title.json
        31996494-1-abstract.json
    '''
    paper_id = path_to_paper_id(path)
    cord_uid, sourcedb, sourceid, _obj = read_meta(paper_id, metaf)

    for div_id, entry in enumerate(article):
        text = entry[0]
        section = entry[1]
        payload = {
            "cord_uid": cord_uid,
            "sourcedb": sourcedb,
            "sourceid": sourceid,
            "div_id": div_id,
            "text": text,
            "denotations": denotated_sections[div_id],
        }
        out_name = "-".join([str(cord_uid), str(div_id), section]) + ".json"
        with open(out_name, "w") as fp:
            json.dump(payload, fp)
def main(articles_dir="comm_use_subset_100",
         metaf="meta_subset_100.csv",
         dicts_dir="/home/jesper/EDAN70/classes",
         write_json=False):
    '''
    Tags every article file found directly inside articles_dir.

    articles_dir: folder with the article json files.
    metaf:        metadata csv belonging to that folder.
    dicts_dir:    folder with the dictionary (class) files.
    write_json:   when True, also write the per-section output files with
                  generate_jsons; off by default.

    Called without arguments it processes the 100-article subset. The gold
    standard subset can be run with
        main("gold_standard_subset_10", "gold_standard_subset_10.csv")
    '''
    articles_path = os.path.abspath(articles_dir) + "/"
    articles = [f for f in listdir(articles_path) if isfile(join(articles_path, f))]
    setup_dicts(dicts_dir)
    for f in articles:
        denot_sec, art = tag_article(articles_path + f, metaf)
        if write_json:
            generate_jsons(denot_sec, art, f, metaf)
if __name__ == '__main__':
    # Time the whole run and print the elapsed seconds.
    # time.clock() was removed in Python 3.8, so use perf_counter() instead.
    t0 = time.perf_counter()
    main()
    t1 = time.perf_counter() - t0
    print(t1)