module DBNF where
import qualified Data.Set as S
import qualified Data.Map.Strict as M
import Data.Char
import Data.List
import System.Environment (getArgs)
-- main: read DBNF grammar (dependency BNF), parse stdin line by line, print
-- bracketed parse trees and CoNLL dependency trees
-- example: echo "the cat is old" | runghc DBNF.hs grammars/English.dbnf Text
main = do
xx <- getArgs
case xx of
grfile:startcat:opts -> processRuleBased grfile startcat opts
_ -> putStrLn usage
processRuleBased grfile startcat opts = do
gr <- readFile grfile >>= return . pGrammar
-- putStrLn $ unlines $ map show (rules gr) ---- debug
putStrLn $ checkGrammar gr
interact (unlines . map (processOne gr startcat opts) . filter (not . all isSpace) . lines)
usage = "usage: DBNF <grammar> <startcat> (-onlyparsetrees) (-show=<number>) (-cut=<number>)"
processOne :: Grammar -> Symb -> [String] -> String -> String
processOne gr cat opts s = case opts of
_ | elem "-onlyparsetrees" opts -> unlines ["# parsetree = " ++ prParseTree pt | pt <- parses]
_ -> unlines $ [
"# text = " ++ s,
"# analyses = " ++ show (length rts)
] ++ dtreess
where
dtreess = map unlines [
["# parsetree = " ++ prParseTree pt, "# weight = " ++ show (treeWeight pt), dt] | (pt,dt) <- zip parses dtrees
]
doshow = case filter (isPrefixOf "-show") opts of
cut:_ -> take (read (drop 6 cut))
_ -> take 1
docut = case filter (isPrefixOf "-cut") opts of
cut:_ -> take (read (drop 5 cut))
_ -> id
dtrees = map (prDepTree . markDependencies gr) parses
parses = doshow rts
rts = rankTrees $ docut $ totals ++ chunks
(totals,chunks) = parse gr cat (words s)
-- chart parsing from Peter Ljunglöf, "Pure Functional Parsing", 2002
-- p. 15
type Symb = String
type Token = String
data Rule = Rule {
constr :: Symb,
lhs :: Symb,
rhs :: [Symb],
labels :: [Symb],
weight :: Double
}
deriving Show
data Grammar = Grammar {
rules :: [Rule],
terminalmap :: M.Map Token [Symb], -- token to cats
catmap :: M.Map Symb Symb, -- cat to UD pos
posmap :: M.Map Symb [Symb], -- pos to cats
depmap :: M.Map Symb Symb, -- pos to label
dirdepmap :: M.Map (Symb,Symb,Bool) Symb -- (pos,headpos,ifbefore) to label
}
deriving Show
terminal :: Grammar -> Token -> [Symb]
terminal g t =
let
ts =
[c | Just cs <- [M.lookup t (terminalmap g)], c <- cs] ++
[c | (s, Just _) <- [unPOS t], Just cs <- [M.lookup s (terminalmap g)], c <- cs] ++
[c | (s, Just p) <- [unPOS t], c <- lookpos p]
in if (null ts) then ["Str"] else ts
where
lookpos p = [c | pm <- normalize p, Just cs <- [M.lookup pm (posmap g)], c <- cs]
-- match if e.g. grammar gives PRON_PronType=Rel, input has PRON_Case=Acc|PronType=Rel
normalize p = case break (=='_') p of
(pos,[]) -> [pos]
(pos,_:feats) -> pos:[pos ++ "_" ++ fs | (po,_:fs) <- allpos, po == pos, isInfixOf fs feats] -- TODO: only one feature considered
allpos = map (break (=='_')) $ M.keys (posmap g)
-- instead of having a word in the lexicon, mark it in input as word:<POS> where POS matches a category
--- a bit complicated because of 11:30:<NUM>
unPOS :: Token -> (Token,Maybe Symb)
unPOS t = case break (==':') (reverse t) of
(p@(_:_),_:s) | head p == '>' && last p == '<' -> (reverse s, Just (reverse (tail (init p))))
_ -> (t,Nothing)
-- lower weight to ad-hoc words
tokenWeight :: Token -> Double
tokenWeight t = maybe 0.5 (const 0.2) $ snd $ unPOS t
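-- for illustration:
--   unPOS "cat"         == ("cat", Nothing)        tokenWeight "cat"         == 0.5
--   unPOS "11:30:<NUM>" == ("11:30", Just "NUM")   tokenWeight "11:30:<NUM>" == 0.2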
emptyGrammar = Grammar [] M.empty M.empty M.empty M.empty M.empty
-- p. 64
type Edge = (Int,Symb,[Symb])
type State = S.Set Edge
type Chart = [State]
buildChart :: Grammar -> [Token] -> Chart
buildChart grammar input = finalChart
where
finalChart :: Chart
finalChart = map buildState initialChart
initialChart :: Chart
initialChart = S.empty : map initialState (zip [0..] input)
initialState :: (Int,Token) -> State
initialState (i,tok) = S.fromList [(i,cat,[]) | cat <- terminal grammar tok]
buildState :: State -> State
buildState = limit more
more :: Edge -> State
more e@(j,a,cats) = case cats of
[] -> S.fromList [(j, lhs r, bs) |
r <- rules grammar,
a':bs <- [rhs r],
a == a'
]
`S.union`
S.fromList [(i, b, bs) |
(i, b, a':bs) <- S.toList (finalChart !! j),
a == a'
]
_ -> S.empty
-- p. 14
limit :: Ord a => (a -> S.Set a) -> S.Set a -> S.Set a
limit more start = limit' start start
where
limit' old new
| S.null new' = old
| otherwise = limit' (S.union new' old) (S.difference new' old)
where
new' = S.unions (S.toList (S.map more new))
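-- for illustration, limit computes the closure of start under more, e.g.
--   limit (\n -> S.fromList [n `div` 2]) (S.fromList [8]) == S.fromList [0,1,2,4,8]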
-- p. 66
type Passive = (Int,Int,Symb)
passiveEdges :: Chart -> [Passive]
passiveEdges chart = [
(i, j, cat) |
(j,state) <- zip [0..] chart,
(i,cat,[]) <- S.toList state
]
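-- a passive edge (i,j,cat) records a completed constituent of category cat
-- spanning input positions i..j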
data ParseTree =
PT (Symb,Symb,[Symb],Double) [ParseTree] -- (cat,constr,labels,weight) subtrees
| PL (Symb,Token) (Int,Symb,Int,Double) -- (cat,terminal) (position, label, head, weight)
deriving Show
buildTrees :: Grammar -> [Token] -> [Passive] -> [(Passive,[ParseTree])]
buildTrees grammar input passives = edgeTrees
where
edgeTrees :: [(Passive,[ParseTree])]
edgeTrees = [(pe, treesFor pe) | pe <- passives]
treesFor (i,j,cat) = [
PT (cat, constr rule, labels rule, weight rule) trees |
rule <- rules grammar,
lhs rule == cat, ---- TODO: rule <- rules grammar cat
trees <- children (rhs rule) i j
] ++ [
PL (cat,fst (unPOS tok)) (j,"dep",0, tokenWeight tok) | -- default label and head
i == j-1,
let tok = input !! i,
elem cat (terminal grammar tok)
]
children :: [Symb] -> Int -> Int -> [[ParseTree]]
children cs i k = case cs of
[] -> [[] | i == k]
c:cs -> [
tree : rest |
i <= k,
((i',j,c'),trees) <- edgeTrees,
i == i', c == c',
rest <- children cs j k,
tree <- trees
]
parse :: Grammar -> Symb -> [Token] -> ([ParseTree],[ParseTree])
parse grammar cat input = (completes,chunks)
where
completes = maybe [] id $ lookup (0, length input, cat) $ subtrees
chunks = chunkParses grammar subtrees
subtrees = buildTrees grammar input (passiveEdges (buildChart grammar input))
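-- completes: trees spanning the whole input in the start category;
-- chunks: low-weight "Chunks" analyses assembled from passive edges by longest
-- match, combined with the complete parses in processOne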
-- longest-match chunk parsing, with the head tried in each position
chunkParses :: Grammar -> [(Passive,[ParseTree])] -> [ParseTree]
chunkParses gr subtrees =
[PT node subs | subs <- subss, node <- chunknodes subs ]
where
subss = [
next 0 subtreelist -- left to right longest match
---- ,prev mx revsubtreelist -- right to left longest match
]
chunknodes subs = [
("Chunks", "chunks", take n dlabs ++ ["head"] ++ drop (n+1) dlabs, 0.0001)
| let len = length subs,
n <- [0..len - 1],
let sposs = [ (t,posOf t) | t <- subs],
let hpos = snd (sposs !! n),
let dlabs = [dirLabelOf t (p,hpos,i<n) | (i,(t,p)) <- zip [0..] sposs]
]
next :: Int -> [((Int,Int),ParseTree)] -> [ParseTree]
next i sl = case sl of
[] -> []
((k,j),t):ss | k == i -> t : next j ss
_:ss -> next i ss
prev :: Int -> [((Int,Int),ParseTree)] -> [ParseTree]
prev i sl = case sl of
[] -> []
((k,j),t):ss | j == i -> t : prev k ss
_:ss -> prev i ss
subtreelist :: [((Int,Int),ParseTree)]
subtreelist = sortOn (\ ((i,j),_) -> (i,0-j))
[((i,j),t) |
((i,j,_),ts) <- subtrees, t:_ <- [ts] --- [rankTrees ts]
]
revsubtreelist = reverse subtreelist
mx = case revsubtreelist of ((_,j),_):_ -> j ; _ -> 0
dirLabelOf t pqb = maybe (labelOf t) id $ M.lookup pqb (dirdepmap gr)
labelOf t = maybe "dep" id $ M.lookup (catOf t) (depmap gr)
posOf t = maybe "X" id $ M.lookup (catOf t) (catmap gr)
catOf t = case t of
PT (cat,_,_,_) _ -> cat
PL (cat,_) _ -> cat
-- context-free weight ("probability") of a tree
treeWeight :: ParseTree -> Double
treeWeight = tprob
where
tprob t = case t of
PT (_,f,_,w) ts -> product (w : map tprob ts)
PL _ (_,_,_,w) -> w
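-- e.g. a tree built by a rule of weight 0.7 over two plain-word leaves
-- (weight 0.5 each) gets weight 0.7 * 0.5 * 0.5 = 0.175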
-- sort by descending weight
rankTrees :: [ParseTree] -> [ParseTree]
rankTrees = sortOn ((1-) . treeWeight)
-- mark dependency labels and heads in leaf nodes
-- simplified version of Kolachina and Ranta, LiLT 2016
markDependencies :: Grammar -> ParseTree -> ParseTree
markDependencies grammar =
mark ("root",0) .
annotate
where
annotate pt = case pt of
PT (cat,fun,ds,w) pts -> PT (cat,fun, ds, w) (map annotate pts)
PL (cat,tok) info -> PL (lookc cat,tok) info
mark (lab,hd) pt = case pt of
PL tok (i,_,_,w) -> PL tok (i,lab,hd,w)
PT (cat,fun,labs,w) pts ->
PT (cat,fun,labs,w) [
markIf (lab,hd) (l,h) l t
| let tls = zip pts labs,
let h = headTok tls,
(t,l) <- tls
]
markIf labhd lh l t = case l of
"head" -> mark labhd t
_ -> mark lh t
headTok tls = case filter ((=="head") . snd) tls of
(PL _ (i,_,_,_),_):_ -> i
(PT (_,_,ls,_) ts,_):_ -> headTok (zip ts ls)
lookc cat = maybe cat id (M.lookup cat (catmap grammar))
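-- a worked example, assuming a hypothetical rule "NP ::= Det N # det head"
-- and a catmap with Det -> DET, N -> NOUN:
--
--   markDependencies gr
--     (PT ("NP","R1",["det","head"],0.5)
--         [PL ("Det","the") (1,"dep",0,0.5), PL ("N","cat") (2,"dep",0,0.5)])
--   ==
--   PT ("NP","R1",["det","head"],0.5)
--      [PL ("DET","the") (1,"det",2,0.5), PL ("NOUN","cat") (2,"root",0,0.5)]
--
-- i.e. the leaf under the "head" label becomes the head token (here position 2,
-- attached to the root) and the other leaves point to it with their labels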
------------------------------
-- printing trees
------------------------------
prParseTree :: ParseTree -> String
prParseTree pt = case pt of
PT (cat,fun,_,_) pts -> parenth (unwords (trim cat : map prParseTree pts))
PL (cat,tok) _ -> parenth (unwords [trim cat, trim tok])
where
parenth s = "(" ++ s ++ ")"
trim c = case c of --- for printing the tree via GF, make identifiers valid
'\'':_ -> c ++ "'"
x:xs | not (isLetter x && all (\y -> isAlphaNum y || elem y "_'") xs) -> "'" ++ c ++ "'"
_ -> c
prDepTree :: ParseTree -> String
prDepTree = unlines . map prOne . getTokens
where
getTokens pt = case pt of
PT _ pts -> concatMap getTokens pts
PL (pos,tok) (i,lab,hd,_) -> [(show i,tok,pos,show hd,lab)]
prOne (i,t,p,h,d) = concat (intersperse "\t" [i,t,unc,pos,unc,mor,h,d,unc,unc])
where (pos,mor) = case break (=='_') p of
(pos,_:mor) -> (pos,mor)
_ -> (p,unc)
unc = "_"
------------------------------
-- textual format of DBNF grammars
-------------------------------
-- Simple format, one rule per line; comment lines start with --.
-- Phrase structure rule format: LHS ::= RHS (# labels (# weight))?
--
-- Cl ::= NP do not VP # nsubj aux advmod head # 0.7
--
-- All symbols in LHS and RHS are nonterminals.
-- Terminals are introduced by rules of form: #token NONTERM words
--
-- #token do do does did done
-- #token N man men cat cats
--
-- Universal pos tags are defined by rules of form: #pos POS nonterms
--
-- #pos ADV Adv IAdv not
--
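-- Default dependency labels (read by addRule below) are given by rules of form:
-- #dep POS label, and direction-sensitive ones by: #dirdep POS HEADPOS True|False label,
-- where True means the dependent precedes the head. For example:
--
-- #dep ADV advmod
-- #dirdep NOUN VERB True nsubj
--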
pGrammar :: String -> Grammar
pGrammar = combine . addRules . map words . filter relevant . lines
where
relevant l = case l of
'-':'-':_ -> False
_ | all isSpace l -> False
_ -> True
combine (rs,ts,cs,ds,dds) = Grammar
(numRules rs)
(M.fromListWith (++) ts)
(M.fromList (("Str","X") : cs))
(posm cs)
(M.fromList ds)
(M.fromList dds)
addRules = foldr addRule ([],[],[],[],[])
addRule ws g@(rs,ts,cs,ds,dds) = case ws of
"#token":c:ww -> (rs, [(w,[c]) | w <- ww] ++ ts, cs,ds,dds)
"#pos":c:ww -> (rs, ts,[(w,c) | w <- ww] ++ cs,ds,dds)
"#dep":p:l:_ -> (rs, ts, cs, (p,l):ds,dds)
"#dirdep":p:q:b:l:_ -> (rs, ts, cs, ds, ((p,q,read b),l):dds)
c:"::=":ww -> (
expandRule (getRule (unwords ws) c (splitSemic ww)) ++ rs, ts,cs,ds,dds)
_ -> error ("rule not parsed: " ++ unwords ws)
expandRule r = [
Rule (constr r) (lhs r) cs labs (weight r) |
(cs,labs) <- combinations (zip (map optionalize (rhs r)) (labels r))
]
-- optional category with ?, e.g. neg? -> Left neg
optionalize c = case c of
_ | last c == '?' -> Left (init c)
_ -> Right c
combinations :: [(Either Symb Symb,Symb)] -> [([Symb],[Symb])]
combinations cls = [
unzip (concat cl) |
let combs (c,l) = case c of
Left c -> [[(c,l)], []]
Right c -> [[(c,l)]],
let mcls = map combs cls,
cl <- sequence mcls
]
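-- for illustration:
--   combinations [(Left "neg","advmod"), (Right "VP","head")]
--     == [(["neg","VP"],["advmod","head"]), (["VP"],["head"])]
-- i.e. each optional (Left) category is either kept or dropped, together with its label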
getRule s c wws = case wws of
[cs,labs,[p]] -> Rule "" c cs (fixLabs cs labs) (read p)
[cs,labs] -> Rule "" c cs (fixLabs cs labs) 0.5
[cs] -> Rule "" c cs (fixLabs cs []) 0.5
_ -> error ("ill-formed rule: " ++ s)
fixLabs cs labs = if length cs == 1 then ["head"] else labs
numRules rs = [Rule ("R" ++ show i) c cs labs p |
(i,Rule _ c cs labs p) <- zip [1..] rs]
posm cs = M.fromListWith (++) [(p,[c]) | (c,p) <- cs]
splitSemic ws = case break (flip elem [";","#"]) ws of
(cs,_:rest) -> cs : splitSemic rest
([],_) -> []
(cs,_) -> [cs]
checkGrammar :: Grammar -> String
checkGrammar g = case checks g of
[] -> []
cs -> error $ unlines $ "Errors in grammar:" : cs
where
checks g =
["invalid labels in " ++ show r | r <- rules g, length (rhs r) /= length (labels r) || noUniqueHead r]
noUniqueHead r = length (lhs r) > 0 && length (filter (=="head") (labels r)) /= 1
prGrammar :: Grammar -> String
prGrammar gr = unlines $
[unwords ([lhs r, "::="] ++ rhs r ++ ["#"] ++ labels r ++ ["#"] ++ [show (weight r)]) | r <- rules gr] ++
[unwords ["#token", cat, tok] | (tok,cats) <- M.assocs (terminalmap gr), cat <- cats] ++
[unwords ("#pos" : pos : cats) | (pos,cats) <- M.assocs (posmap gr)] ++
[] ---- dep,dirdep
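------------------------------
-- a minimal end-to-end example
------------------------------
-- A tiny hypothetical grammar in the format above, and a helper that parses one
-- sentence with it; exampleGrammarSource and exampleParse are illustrative names,
-- not part of the grammars shipped with this module.
exampleGrammarSource :: String
exampleGrammarSource = unlines [
  "S  ::= NP VP # nsubj head # 0.9",
  "NP ::= Det N # det head",
  "VP ::= cop Adj # cop head",
  "#token Det the a",
  "#token N cat cats",
  "#token cop is are",
  "#token Adj old new",
  "#pos DET Det",
  "#pos NOUN N",
  "#pos AUX cop",
  "#pos ADJ Adj"
  ]
-- e.g. putStr (exampleParse "the cat is old") prints the ranked parse trees,
-- their weights, and a CoNLL-style dependency tree
exampleParse :: String -> String
exampleParse = processOne (pGrammar exampleGrammarSource) "S" []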