-
Notifications
You must be signed in to change notification settings - Fork 0
/
functions.py
40 lines (34 loc) · 1.79 KB
/
functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import math
from typing import Callable, Optional
from nltk.collocations import TrigramCollocationFinder
# SMOOTHING FUNCTIONS
# Lidstone smoothing plus absolute/linear discounting estimators for trigram probabilities
def lidstone_smooth(lambda_value: float, total_trigrams: int, trigram_counts: dict, b_value: int, trigram: tuple):
    """Return the Lidstone (additive) smoothed estimate for ``trigram``.

    Computes ``(count + lambda_value) / (total_trigrams + lambda_value * b_value)``
    where ``count`` is the entry for ``trigram`` in ``trigram_counts``
    (0 when the trigram is absent).

    Args:
        lambda_value: Additive constant applied to every count.
        total_trigrams: Denominator base (presumably the number of trigram
            tokens in the training data -- confirm against the caller).
        trigram_counts: Mapping from trigram to its observed count.
        b_value: Multiplier of ``lambda_value`` in the denominator
            (presumably the number of possible trigram types -- confirm).
        trigram: The trigram whose probability is requested.

    Returns:
        The smoothed probability estimate.
    """
    observed = trigram_counts.get(trigram, 0)
    numerator = observed + lambda_value
    denominator = total_trigrams + lambda_value * b_value
    return numerator / denominator
def pau_discounting(delta: float, total_trigrams: int, trigram_counts: dict, trigram: tuple, b_value: int = 0):
    """Return a discounted estimate for ``trigram`` with a uniform add-back term.

    The result is the sum of two parts:

    * the discounted relative frequency
      ``max(count - delta, 0) / total_trigrams``;
    * the reserved mass ``delta * U / total_trigrams`` multiplied by
      ``1 / U``, where ``U = len(trigram_counts)``.

    Args:
        delta: Amount subtracted from each count (floored at zero).
        total_trigrams: Denominator of both terms.
        trigram_counts: Mapping from trigram to its observed count; must be
            non-empty, otherwise ``1 / U`` raises ``ZeroDivisionError``.
        trigram: The trigram whose probability is requested.
        b_value: Accepted so the function can be called with the same
            keywords as the other estimators; it is not used here.

    Returns:
        The discounted probability estimate.
    """
    observed = trigram_counts.get(trigram, 0)
    seen_types = len(trigram_counts)
    discounted_share = max(observed - delta, 0) / total_trigrams
    reserved_mass = delta * seen_types / total_trigrams
    # The reserved mass is spread evenly over the distinct seen trigrams.
    return discounted_share + reserved_mass * (1 / seen_types)
def absolute_discounting(alpha: float, total_trigrams: int, trigram_counts: dict, b_value: int, trigram: tuple):
    """Return the absolute-discounting estimate for ``trigram``.

    With ``U = len(trigram_counts)``:

    * a trigram with a non-zero count gets ``(count - alpha) / total_trigrams``;
    * a trigram with count 0 (or absent from ``trigram_counts``) gets
      ``(U * alpha / (b_value - U)) / total_trigrams``.

    Args:
        alpha: Amount subtracted from every non-zero count.
        total_trigrams: Denominator of the estimate.
        trigram_counts: Mapping from trigram to its observed count.
        b_value: Used as ``b_value - U`` in the zero-count branch, so it must
            differ from ``U`` there (presumably the number of possible
            trigram types -- confirm against the caller).
        trigram: The trigram whose probability is requested.

    Returns:
        The discounted probability estimate.
    """
    observed = trigram_counts.get(trigram, 0)
    seen_types = len(trigram_counts)

    if observed != 0:
        return (observed - alpha) / total_trigrams

    # Mass removed from the seen trigrams, split across the remaining types.
    unseen_types = b_value - seen_types
    return (seen_types * alpha / unseen_types) / total_trigrams
def linear_discounting(alpha: float, total_trigrams: int, trigram_counts: dict, b_value: int, trigram: tuple):
    """Return the linear-discounting estimate for ``trigram``.

    * A trigram with count 0 (or absent from ``trigram_counts``) gets
      ``alpha / (b_value - len(trigram_counts))``.
    * Any other trigram gets ``(1 - alpha) * (count / total_trigrams)``.

    Args:
        alpha: Fraction of mass taken from trigrams with a non-zero count.
        total_trigrams: Denominator of the relative frequency.
        trigram_counts: Mapping from trigram to its observed count.
        b_value: Used as ``b_value - len(trigram_counts)`` in the zero-count
            branch, so it must differ from ``len(trigram_counts)`` there.
        trigram: The trigram whose probability is requested.

    Returns:
        The discounted probability estimate.
    """
    observed = trigram_counts.get(trigram, 0)

    if observed == 0:
        unseen_types = b_value - len(trigram_counts)
        return alpha / unseen_types

    retained_fraction = 1 - alpha
    return retained_fraction * (observed / total_trigrams)
def probs_total(b_value: int, trigram_finder: object, model: dict, total_trigrams: int, smooth: Callable = lidstone_smooth, param: float = 0.5):
    """Return the count-weighted sum of log-probabilities over a finder's trigrams.

    For every ``(trigram, count)`` pair in ``trigram_finder.ngram_fd`` the
    estimate ``smooth(param, trigram=..., b_value=..., total_trigrams=...,
    trigram_counts=model)`` is computed, and ``count * log(estimate)``
    (natural logarithm) is added to the running total.

    Args:
        b_value: Forwarded to ``smooth`` as ``b_value``.
        trigram_finder: Object exposing ``ngram_fd``, a mapping from trigram
            to count (presumably an NLTK ``TrigramCollocationFinder`` --
            confirm against the caller).
        model: Forwarded to ``smooth`` as ``trigram_counts``.
        total_trigrams: Forwarded to ``smooth`` as ``total_trigrams``.
        smooth: Estimator called with ``param`` as its first positional
            argument and the remaining values as keywords.
        param: Smoothing parameter handed to ``smooth``.

    Returns:
        The accumulated total; the integer ``0`` when ``ngram_fd`` is empty.

    Raises:
        ValueError: If ``smooth`` returns a value that ``math.log`` rejects
            (zero or negative).
    """
    log_likelihood = 0
    frequencies = trigram_finder.ngram_fd
    for trigram, occurrences in frequencies.items():
        estimate = smooth(
            param,
            trigram=trigram,
            b_value=b_value,
            total_trigrams=total_trigrams,
            trigram_counts=model,
        )
        log_likelihood += occurrences * math.log(estimate)
    return log_likelihood