-
Notifications
You must be signed in to change notification settings - Fork 0
/
transform.py
33 lines (23 loc) · 829 Bytes
/
transform.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
from gensim import corpora, models, similarities
from build import tokenize
if __name__ == '__main__':
    # Load the dictionary and bag-of-words corpus produced by build.py.
    dictionary = corpora.Dictionary.load('state/big5.dict')
    corpus = corpora.MmCorpus('state/big5.mm')
    print(corpus)

    # Fit a TF-IDF weighting model on the bag-of-words corpus.
    tfidf = models.TfidfModel(corpus)
    print(tfidf)

    # Transform individual vector
    new_doc = 'She arrived a couple of days later'
    new_vec = dictionary.doc2bow(tokenize(new_doc))
    print(tfidf[new_vec])

    # Transform the whole corpus
    corpus_tfidf = tfidf[corpus]
    for doc in corpus_tfidf:
        print(doc)

    # Project the TF-IDF vectors into a 2-topic latent semantic space.
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)
    corpus_lsi = lsi[corpus_tfidf]
    # FIX: print_topics() only emits via the logging module (INFO level) and
    # *returns* the topic list; with no logging configured the original call's
    # output was invisible, so print the returned topics explicitly.
    for topic in lsi.print_topics(2):
        print(topic)
    for doc in corpus_lsi:
        print(doc)

    # Persist the trained LSI model and demonstrate reloading it from disk.
    lsi.save('state/big5.lsi')
    lsi = models.LsiModel.load('state/big5.lsi')