-
Notifications
You must be signed in to change notification settings - Fork 1
/
ngram.go
54 lines (48 loc) · 1.43 KB
/
ngram.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
package disfun
import (
"github.com/juju/utils/set"
)
// Ngram compares two strings through the sets of their n-grams, where an
// n-gram is a contiguous run of N items taken from a sequence. The similarity
// of the strings is derived from how many n-grams the two sets share.
//
// References:
//
//	https://webdocs.cs.ualberta.ca/~kondrak/papers/spire05.pdf
//	https://en.wikipedia.org/wiki/N-gram
//	http://m.wolframalpha.com/input/?i=n-grams+%22n-gram+example+of+n-grams+in+wolfram+alpha%22&x=0&y=0
type Ngram struct {
	// Set1 and Set2 collect the n-grams of S1 and S2 respectively; Build fills them.
	Set1, Set2 set.Strings
	// S1 and S2 are the two input strings being compared.
	S1, S2 string
	// N is the length of each n-gram, counted in bytes of the input strings.
	N int
}
// NewNgram returns an Ngram set up to compare s1 and s2 using n-grams of
// length n. Both n-gram sets start out empty; they are filled by Build, which
// Similarity calls before computing its result.
func NewNgram(n int, s1, s2 string) *Ngram {
	ng := new(Ngram)
	ng.N = n
	ng.S1, ng.S2 = s1, s2
	ng.Set1 = set.NewStrings()
	ng.Set2 = set.NewStrings()
	return ng
}
// JaccardCoEfficient returns the Jaccard similarity of Set1 and Set2: the
// size of their intersection divided by the size of their union.
//
// When both sets are empty (for example because both strings are shorter than
// N) the union is empty as well and the ratio would be 0/0, i.e. NaN. In that
// case 0 is returned, because the sets have no n-grams in common.
func (n *Ngram) JaccardCoEfficient() float64 {
	union := n.Set1.Union(n.Set2).Size()
	if union == 0 {
		// Avoid the 0/0 float division that would yield NaN.
		return 0
	}
	shared := n.Set1.Intersection(n.Set2).Size()
	return float64(shared) / float64(union)
}
// Build fills Set1 with every substring of length N found in S1, and Set2
// with every substring of length N found in S2. Substrings are taken by byte
// index, so N counts bytes rather than runes.
//
// A string shorter than N contributes no n-grams. A negative N is treated the
// same way: nothing is added, instead of panicking on an invalid slice bound.
// Because the destinations are sets, calling Build more than once does not
// change the result.
func (n *Ngram) Build() {
	size := n.N
	if size < 0 {
		// s[i:i+size] would have its high bound below its low bound.
		return
	}
	// end is the exclusive upper bound of the current window [end-size, end).
	for end := size; end <= len(n.S1); end++ {
		n.Set1.Add(n.S1[end-size : end])
	}
	for end := size; end <= len(n.S2); end++ {
		n.Set2.Add(n.S2[end-size : end])
	}
}
// Similarity builds the n-gram sets for both strings and then returns the
// Jaccard coefficient of those two sets.
func (n *Ngram) Similarity() float64 {
	n.Build()
	score := n.JaccardCoEfficient()
	return score
}