build.py

from collections import Counter
from collections import namedtuple
from pprint import pprint

from gensim import corpora
from nltk.corpus import stopwords

# English stop words from NLTK (127 in the release used here; the exact
# count varies with the NLTK version).
STOPS = frozenset(stopwords.words('english'))

TRAIN_ROOT = 'data/train/'

Author = namedtuple('Author', ['name', 'id'])
Text = namedtuple('Text', ['file', 'author'])
def is_stop(word):
    """Is `word` a stopword?"""
    return word.lower() in STOPS


def is_stop_test():
    assert is_stop('the')
    assert is_stop('tHe')
    assert is_stop('aN')
    assert is_stop('A')
def tokens_from_file(fname):
    """Yield the non-stopword tokens of the file at `fname`, one at a time."""
    with open(fname) as f:
        for line in f:
            words = unstop(tokenize(line))
            for word in words:
                yield word
def remove_stops(docs):
    """Drop stopwords from every document in `docs`."""
    return [[word for word in doc if not is_stop(word)] for doc in docs]


def remove_rare(docs):
    """Drop words that occur only once across the whole collection."""
    counts = Counter(word for doc in docs for word in doc)
    return [[word for word in doc if counts[word] > 1] for doc in docs]
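

# An illustrative test for remove_rare, in the style of the other tests in
# this script; it is an addition, not part of the original, and is not
# wired into test().
def remove_rare_test():
    docs = [['a', 'a', 'b'], ['b', 'c']]
    # 'c' occurs only once across the collection, so it is dropped;
    # 'a' and 'b' each occur at least twice, so they survive.
    assert remove_rare(docs) == [['a', 'a', 'b'], ['b']]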
def tokenize(string):
    """Turn a string into a list of (lowercase) words."""
    return string.lower().split()


def tokenize_test():
    assert tokenize('') == []
    assert tokenize('How are you?') == ['how', 'are', 'you?']
    assert tokenize('how') == ['how']
    assert tokenize('How') == ['how']
def test():
    is_stop_test()
    tokenize_test()


def unstop(lst):
    """Drop stopwords from a single list of words."""
    return [word for word in lst if not is_stop(word)]
if __name__ == '__main__':
    test()

    tolst = Author('Tolstoy', 0)
    dosto = Author('Dostoevsky', 1)
    dickens = Author('Dickens', 2)
    flaubert = Author('Flaubert', 3)
    austen = Author('Austen', 4)

    texts = [
        Text('anna_karenina.txt', tolst),
        Text('brothers_karamazov.txt', dosto),
        Text('little_dorrit.txt', dickens),
        Text('madame_bovary.txt', flaubert),
        Text('pride_and_prejudice.txt', austen),
    ]

    # Tokenize each training text; stopwords are already removed by
    # tokens_from_file.
    docs = [list(tokens_from_file(TRAIN_ROOT + text.file))
            for text in texts]
    pprint(docs[1][:10])

    docs = remove_rare(docs)
    pprint(docs[1][:10])

    # Map each word to an integer id and persist the mapping.
    dictionary = corpora.Dictionary(docs)
    dictionary.save('state/big5.dict')

    # Bag-of-words: each document becomes a list of (token_id, count) pairs.
    corpus = [dictionary.doc2bow(text) for text in docs]
    corpora.MmCorpus.serialize('state/big5.mm', corpus)
    print(corpus[1][:10])

    # Words not seen during training are simply ignored by doc2bow.
    new_doc = 'She arrived a couple of days later'
    new_vec = dictionary.doc2bow(tokenize(new_doc))
    print(new_vec)
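
    # Illustrative round-trip check (an addition, not in the original
    # script): reload the artifacts saved above to confirm they can be
    # read back with gensim's standard loaders.
    loaded_dict = corpora.Dictionary.load('state/big5.dict')
    loaded_corpus = corpora.MmCorpus('state/big5.mm')
    print(loaded_dict)
    print(loaded_corpus)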