-
Notifications
You must be signed in to change notification settings - Fork 4
/
prepare_dataset.py
110 lines (91 loc) · 3.71 KB
/
prepare_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# -*- coding:utf-8 -*-
import jieba
import xlrd
import sys
# Python 2-only idiom: reload(sys) brings back sys.setdefaultencoding (which is
# removed from the module namespace at interpreter start-up) so that implicit
# str <-> unicode conversions in this script use UTF-8 instead of ASCII.
# NOTE(review): reload is not a builtin on Python 3, so this file presumably
# targets Python 2 only -- confirm before porting.
reload(sys)
sys.setdefaultencoding("utf-8")
# Location of the stop-word list; the file is read line by line below.
stopword_dict = "/home/zqh/mygit/semanaly/dataset/stop_words_ch.txt"

# Stop words are stored as dict keys (every value is 1) so that membership
# tests are constant-time. Only the newline character is stripped from each
# line; any other surrounding whitespace stays part of the key.
with open(stopword_dict) as stopword_file:
    stop_dict = dict.fromkeys(
        (line.strip("\n") for line in stopword_file), 1
    )
def rm_stopwords(words):
    """
    Drop every item of ``words`` that is a key of the module-level ``stop_dict``.

    :param words: iterable of tokens
    :return: list of the remaining tokens, in their original order
    """
    return [token for token in words if token not in stop_dict]
# Module-level container. NOTE(review): no code visible in this file reads or
# fills it -- the extraction functions write straight to CSV files instead.
data_set = dict()
def _write_labelled_record(labelled_file, plain_file, uid, sentences,
                           first_class, second_class):
    """Write one accumulated record to the labelled and the no-label file."""
    # Strip once here so every record -- including the last one of the
    # sheet -- is written without leading/trailing whitespace.
    text = " ".join(sentences).strip()
    labelled_file.write(
        ",".join((uid, text, first_class, second_class)) + "\n"
    )
    plain_file.write(text + "\n")


def get_data_from_excel_v2(excel_path, head, is_first=True, cut_words=False):
    """
    Flatten the first sheet of a labelled workbook into two text files.

    Row 0 and every later row whose column-0 value contains ``head`` start a
    new record; such a row supplies the record id (column 0), first_class
    (column 1) and second_class (column 2). Every other row with a non-empty
    column 0 contributes its column-1 value as one sentence of the current
    record; sentences shorter than two characters are ignored.

    Output is APPENDED to ``./data/mobile_dataset_jieba.csv``
    (``id,text,first_class,second_class``) and to
    ``./data/mobile_dataset_no_label_jieba.csv`` (text only, one record per
    line). A record is written even when it collected no sentences.

    NOTE(review): fields are joined with "," without any quoting, so a comma
    inside a sentence or label yields a malformed CSV row.

    :param excel_path: path of the workbook to read
    :param head: substring identifying a record-header row in column 0
    :param is_first: write the CSV header line before the records
    :param cut_words: segment each sentence with jieba and join the tokens
                      with single spaces
    :return: None
    """
    table = xlrd.open_workbook(excel_path).sheets()[0]

    uid = table.cell(0, 0).value
    first_class = table.cell(0, 1).value
    second_class = table.cell(0, 2).value
    sentences = []  # sentences collected for the record currently being built

    with open("./data/mobile_dataset_jieba.csv", "a") as f1, \
            open("./data/mobile_dataset_no_label_jieba.csv", "a") as f2:
        if is_first:
            f1.write("id,text,first_class,second_class\n")

        for i in range(1, table.nrows):
            key = table.cell(i, 0).value

            if head in key:
                # A new header row closes the record collected so far.
                _write_labelled_record(f1, f2, uid, sentences,
                                       first_class, second_class)
                uid = key
                first_class = table.cell(i, 1).value
                second_class = table.cell(i, 2).value
                sentences = []
                continue

            if key == "":
                continue

            sentence = table.cell(i, 1).value
            if len(sentence) < 2:
                continue
            if cut_words:
                sentence = " ".join(jieba.cut(sentence, cut_all=False))
            sentences.append(sentence)

        # Flush the last record; no header row follows it to trigger a write.
        _write_labelled_record(f1, f2, uid, sentences,
                               first_class, second_class)
def _write_unlabelled_record(id_file, plain_file, uid, sentences):
    """Write one accumulated record unless it collected no sentences."""
    if not sentences:
        return
    # Both files receive the same stripped text so they stay line-aligned
    # and free of trailing whitespace.
    text = " ".join(sentences).strip()
    id_file.write(uid + "," + text + "\n")
    plain_file.write(text + "\n")


def get_data_from_excel_v3(excel_path, head, is_first=True, cut_words=False):
    """
    Flatten the first sheet of an unlabelled workbook into two text files.

    Rows 0-2 are not read. Row 3 and every later row whose column-0 value
    contains ``head`` start a new record and supply its id (column 0). Every
    other row with a non-empty column 0 contributes its column-1 value as one
    sentence of the current record; sentences shorter than two characters are
    ignored. Records that collected no sentences are skipped, including the
    last record of the sheet.

    Output OVERWRITES ``./data/mobile_dataset_jieba_fish.csv`` (``id,text``)
    and ``./data/mobile_dataset_no_label_jieba_fish.csv`` (text only, one
    record per line).

    NOTE(review): fields are joined with "," without any quoting, so a comma
    inside a sentence yields a malformed CSV row.

    :param excel_path: path of the workbook to read
    :param head: substring identifying a record-header row in column 0
    :param is_first: write the CSV header line before the records
    :param cut_words: remove all whitespace from each sentence, segment it
                      with jieba and join the tokens with single spaces
    :return: None
    """
    table = xlrd.open_workbook(excel_path).sheets()[0]

    uid = table.cell(3, 0).value
    sentences = []  # sentences collected for the record currently being built

    with open("./data/mobile_dataset_jieba_fish.csv", "w") as f1, \
            open("./data/mobile_dataset_no_label_jieba_fish.csv", "w") as f2:
        if is_first:
            f1.write("id,text\n")

        for i in range(4, table.nrows):
            key = table.cell(i, 0).value

            if head in key:
                # A new header row closes the record collected so far.
                _write_unlabelled_record(f1, f2, uid, sentences)
                uid = key
                sentences = []
                continue

            if key == "":
                continue

            sentence = table.cell(i, 1).value
            if len(sentence) < 2:
                continue
            if cut_words:
                # Drop existing whitespace first so the only spaces in the
                # output are the token separators added after segmentation.
                sentence = "".join(sentence.split())
                sentence = " ".join(jieba.cut(sentence, cut_all=False))
            sentences.append(sentence)

        # Flush the last record; no header row follows it to trigger a write.
        _write_unlabelled_record(f1, f2, uid, sentences)
# Input workbooks, given relative to the current working directory.
excel_path1 = "data/guangxi.data.ok.target.machine_0.xlsx"
excel_path2 = "data/henan.data.ok.target.machine_0.xlsx"
excel_path3 = "data/beiyou.0811.xlsx"

# Disabled: labelled extraction over the guangxi and henan workbooks. The
# second call passes is_first=False so the CSV header is not written twice
# into the appended output file.
# get_data_from_excel_v2(excel_path1, "guangxi", cut_words=True)
# get_data_from_excel_v2(excel_path2, "henan", is_first=False, cut_words=True)

# Active run: unlabelled extraction; header rows are those whose first column
# contains "170".
get_data_from_excel_v3(excel_path=excel_path3, head="170", cut_words=True)