-
Notifications
You must be signed in to change notification settings - Fork 15
/
GF2UD.hs
313 lines (256 loc) · 11.5 KB
/
GF2UD.hs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
module GF2UD where
import RTree
import UDConcepts
import GFConcepts
import UDAnnotations
import UDOptions
import PGF hiding (CncLabels)
import qualified Data.Map as M
import Data.List
import Data.Char
import Data.Maybe
---------
-- to test
-- env <- getEnv
testString i opts env s = do
let eng = actLanguage env
let t = parseEng env s
testTree i opts env t
testTreeString i opts env s = do
case readExpr s of
Just t -> testTree i opts env t
_ -> error $ "not a well-formed tree: " ++ s
testTree i opts env t = do
let eng = actLanguage env
ifOpt opts "gf" $ showExpr [] t
let e0 = expr2annottree env eng t
ifOpt opts "an0" $ prLinesRTree prAnnotNode e0
let e = annotTree2labelledTree env eng e0
ifOpt opts "an1" $ prLinesRTree prAnnotNode e
let u0 = labelledTree2wordTree e
ifOpt opts "an2" $ prLinesRTree prAnnotNode u0
let u1 = applyNonlocalAnnotations env eng u0
ifOpt opts "an3" $ prLinesRTree prAnnotNode u1
let u2 = adjustUDTree env (wordTree2udTree u1)
ifOpt opts "ut" $ prUDTree u2
let u = adjustUDIds (udTree2sentence u2)
ifOpt opts "ud" $ prUDSentence i u
ifOpt opts "err" $ unlines (errors u)
-- if isOpt opts "vud" then (visualizeUDSentences env [u]) else return ()
return u
-- richly annotated abstract tree, with extra nodes for each token
type AnnotTree = RTree AnnotNode
data AnnotNode = AnnotNode {
anFun :: CId,
anCat :: CId,
anLabel :: String, -- UD label coming from above
anTarget :: Maybe String, -- intended label for syncat words not targeted to heads
anToken :: Maybe (Int,TokenInfo) -- token belonging to this node, with position number in sentence
}
deriving (Eq,Show)
--- anTokens :: [(Int,(String,String,String,[UDData],String))] -- tokens dominated by this node
data TokenInfo = TokenInfo {
tokWord :: String,
tokLemma :: String,
tokPOS :: String,
tokMorpho :: [UDData]
}
deriving (Eq,Show)
prAnnotNode (AnnotNode f c l tg ts) =
unwords $
intersperse ";" ["@" ++ show i ++ ": " ++ prTokenInfo t | Just (i,t) <- [ts]] ++
[maybe "" ('>':) tg] ++
[showCId f,showCId c,l]
prTokenInfo (TokenInfo w l p m) = unwords [w,l,p,prt m]
---------------------------
-- the GF to UD pipeline
---------------------------
gf2ud :: UDEnv -> Language -> PGF.Tree -> UDSentence
gf2ud env lang =
adjustUDIds --- hack: should check why needed
. udTree2sentence
. adjustUDTree env
. wordTree2udTree
. labelledTree2wordTree
. annotTree2labelledTree env lang
. expr2annottree env lang
adjustUDTree :: UDEnv -> UDTree -> UDTree
adjustUDTree env tr@(RTree node trs) = RTree (adjust node atrs) atrs
where
atrs = map (adjustUDTree env) trs
adjust n ts = case M.lookup (udDEPREL n) changemap of
Just [(e,CAbove d)] | any ((==d) . udDEPREL . root) ts -> n{udDEPREL = e}
Just [(e,CFeatures fs)] | all (flip elem (udFEATS n)) fs -> n{udDEPREL = e}
_ -> n
changemap = changeLabels (cncLabels env)
-- change node structure, create links to heads
wordTree2udTree :: AnnotTree -> UDTree
wordTree2udTree = annot udIdRoot where
annot udid tr@(RTree node trs) =
let (position,tok) = case anToken node of
Just pt -> pt
_ -> error $ "no position or token from node " ++ prAnnotNode node
in RTree
(UDWord {
udID = UDIdInt position,
udFORM = tokWord tok,
udLEMMA = let l = tokLemma tok in if null l then tokWord tok else l,
udUPOS = tokPOS tok,
udXPOS = showCId (anCat node),
udFEATS = tokMorpho tok,
udDEPREL = unHeadLabel (anLabel node),
udHEAD = udid,
udDEPS = "_",
udMISC = [UDData "FUN" [showCId (anFun node)]] ----
})
[annot (UDIdInt position) t | t <- trs]
-- apply operations that change the tree structure
applyNonlocalAnnotations :: UDEnv -> Language -> AnnotTree -> AnnotTree
applyNonlocalAnnotations env lang =
lowerHead
where
lowerHead tree@(RTree node trs) = RTree node (changes trs trs)
changes bts ts = case ts of
t:tt -> case anTarget (root t) of
Just label -> case break (\t -> anLabel (root t) == label) bts of
(bts1,h:bts2) -> changes (dropOut t (bts1 ++ [h{subtrees = t : subtrees h}] ++ bts2)) tt
---- _ -> error $ "target " ++ label ++ " not found among\n" ++ unlines (map (prAnnotNode . root) bts) --- t : changes bts tt
_ -> changes bts tt ---- ?
_ -> changes bts tt
[] -> bts
dropOut t ts = filter (\u -> root t /= root u) ts
-- erase intermediate nodes, building a tree of words
labelledTree2wordTree :: AnnotTree -> AnnotTree
labelledTree2wordTree tr@(RTree node trs) =
RTree
node
[labelledTree2wordTree t | t <- trs, isJust (anToken (root t))]
-- assign labels to functions and propagate them down to leaves
annotTree2labelledTree :: UDEnv -> Language -> AnnotTree -> AnnotTree
annotTree2labelledTree env lang =
propagateLabels
. addLabels root_Label
where
pgf = pgfGrammar env
lookFun f = maybe [([],defaultLabels)] id (M.lookup f (funLabels (absLabels env)))
defaultLabels = head_Label:repeat dep_Label
propagateLabels tr@(RTree node trs) =
RTree
(followSpine node trs)
[propagateLabels t | t <- concatMap dependentTrees trs]
dependentTrees tr@(RTree node trs) =
if (anLabel node /= head_Label)
then [tr]
else concatMap dependentTrees trs
followSpine node trs = case trs of
[] -> node
_ -> case [(n,ts) | RTree n ts <- trs, anLabel n == head_Label] of
[(n,ts)] -> followSpine n{anLabel = anLabel node} ts
_ -> error $ unlines $ "ERROR: no unique head among" : map (prAnnotNode . root) trs
-- labels added from fun annotations; for syncat words, from their lemma annotations
--- relying on the order proper nodes + syncat word nodes
addLabels label tr@(RTree node trs) = case lookFun (anFun node) of
fsls -> case funLabelMatches fsls (map (anFun . root) trs) of
ls:_ -> RTree --- make sure the first match is the best!
(if anLabel node == dep_Label then node{anLabel = label} else node) -- don't change a predefined label
[addLabels lab t | (lab,t) <- zip (ls++[anLabel (root t) | t <- trs, isJust (anToken (root t))]) trs]
funLabelMatches psls fs = [ls |
(ps,ls) <- psls,
all funMatch (zip fs ps)
] ++ [defaultLabels]
funMatch (f,p) = maybe True (f==) p -- Nothing matches any function
-- decorate abstract tree with word information
expr2annottree :: UDEnv -> Language -> Tree -> AnnotTree
expr2annottree env lang tree =
addWordsAndCats
$ postOrderRTree
$ expr2abstree
tree
where
pgf = pgfGrammar env
lookCat c = maybe x_POS fst (M.lookup c (catLabels (absLabels env)))
lookWord d w = maybe d id (M.lookup w (wordLabels (cncLabels env)))
lookupLemma f w = lookupFunLemma env lang f w -- (fun,lemma) -> (label,targetLabel)
lookMorpho d c i = maybe d id (M.lookup (c,i) (morphoLabels (cncLabels env))) -- i'th form of cat c
lookupDiscont c i = (M.lookup (c,i) (discontLabels (cncLabels env))) -- (cat,field) -> (pos,label,targetLabel)
lookAuxPos c = maybe x_POS id (M.lookup c (auxCategories (cncLabels env))) -- auxcat -> pos --- auxcat should really be language-dependent
lookupMulti c = M.lookup c (multiLabels (cncLabels env))
-- convert postorder GF to AnnotTree by adding words from bracketed linearization, categories and their UD pos tags, morpho indices
addWordsAndCats (RTree (f,i) ts) = RTree node (map addWordsAndCats ts ++ toktrees)
where
node = AnnotNode {
anFun = f,
anCat = cat,
anTarget = Nothing,
anToken = Nothing,
anLabel = dep_Label
}
toktrees = case (M.lookup i positions) of
Just pws -> case lookupMulti cat of
Just (headfirst,lab) -> case pws of
_:_:_ -> case headfirst of
True -> addLemma (head pws) : [forceLabel lab pw | pw <- tail pws]
False -> [forceLabel lab pw | pw <- init pws] ++ [addLemma (last pws)]
_ -> map addLemma pws -- 0 or 1 words
_ -> headsFirst $ map addLemma pws
_ -> [] ---
headsFirst ts = case partition (\t -> anLabel (root t) == dep_Label) ts of --- heads of discont
(hs,nhs) -> hs ++ nhs
(cat,isLeaf) = valCat (anFun node)
forceLabel lab (posit,(w,lind)) = RTree node{
anToken = Just (posit, TokenInfo w w (lookCat cat) []), --- no morphology, works for particles and prepositions
anLabel = lab
} []
addLemma (posit,(w,lind)) = case isLeaf of
True -> case lookupDiscont cat lind of
Just (pos,label,target) | label == head_Label -> -- head of discontinuous constituent
RTree node{
anLabel = dep_Label, -- to be redefined in annotTree2labelledTree.addLabels
anToken = Just (posit,
TokenInfo w
(mkLemma (anFun node))
(lookCat cat)
(lookMorpho (formData lind) (anCat node) lind)
)
} []
Just (pos,label,target) | label /= head_Label -> -- discontinuous parts, such as verb particles and prepositions
RTree node{
anToken = Just (posit, TokenInfo w w pos []), --- no morphology, works for particles and prepositions
anLabel = label,
anTarget = if target /= head_Label then Just target else Nothing
} []
_ -> RTree node{ -- categorematic single words
anToken = Just (posit,
TokenInfo w
(mkLemma (anFun node))
(lookCat cat)
(lookMorpho (formData lind) (anCat node) lind)
)
} []
_ -> case lookWord (w,[]) w of -- not leaf: syncat words
(lemma,morph) -> case lookupLemma f lemma of
Just (auxcat,(label,target)) -> RTree node{
anLabel = label,
anTarget = if target /= head_Label then Just target else Nothing,
anToken = Just (posit, TokenInfo w lemma (lookAuxPos auxcat) morph)
} []
_ -> RTree node{
anToken = Just (posit, TokenInfo w lemma x_POS morph)
} []
mkLemma f = unwords $ take 1 $ words $ linearize pgf lang (mkApp f []) --- if multiword, 1st word is lemma; e.g. "listen to"
positions = bracketPositions $ case bracketedLinearize pgf lang tree of
b:_ -> b
_ -> error ("ERROR: no linearization for tree " ++ showExpr [] tree)
valCat f = case functionType pgf f of
Just typ -> case unType typ of
(hs,cat,_) -> (cat, null hs) -- valcat, whether atomic
_ -> error ("ERROR: cannot find type of function " ++ showCId f)
-- for each in-order abs-node, find the words that the node dominates, with their position and morpho
bracketPositions :: BracketedString -> M.Map FId [(Int,(String,LIndex))]
bracketPositions = M.map reverse . collect . numerate . pos 0 0
where
pos fid lindex bs = case bs of
Bracket _ fi _ lind _ _ bss -> concatMap (pos fi lind) bss
Leaf s -> [(fid,(w,lindex)) | w <- words s] -- separate words --- every word gets the same morpho index
numerate fws = [(f,(p,wl)) | (p,(f,wl)) <- zip [1..] fws] -- assign a position number to every word
collect fpws = M.fromListWith (++) [(f,[pwl]) | (f,pwl) <- fpws] -- map abstree nodes to word information