-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_preprocessing.py
313 lines (271 loc) · 12.6 KB
/
data_preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
import re
import nltk
import pandas as pd
from ast import literal_eval
import numpy as np
from textblob import TextBlob
from nltk.stem import WordNetLemmatizer
from data_exploration import DataExploration
class DataPreprocessing:
    """
    Feature-engineering helpers for the product review data.

    Most of these preprocessing methods were experiments that did not yield good results; they are kept for
    reference. The static methods operate on the DataFrame (or value) passed in, while the instance methods
    read ``self.df``.
    """

    def __init__(self, df):
        """
        :param df: DataFrame read by the instance methods ('label', 'categories' and 'target' columns)
        """
        self.df = df

    @staticmethod
    def get_train_target(in_csv, out_csv):
        """
        Generate a csv file with avg-rating column (average rating of all reviews for a product) and target column
        (1 if product is awesome, 0 if not)
        :param in_csv: input CSV path
        :param out_csv: output CSV path
        """
        train_data = pd.read_csv(in_csv)
        # mean 'overall' rating per product, named up front so the merge does not depend on _x/_y suffixes
        avg_rating = train_data.groupby('amazon-id')['overall'].mean().rename('avg-rating').reset_index()
        full_train_data = pd.merge(train_data, avg_rating, how='inner', on='amazon-id')
        # target is integer: 1 when the product's average rating is above 4.5
        full_train_data['target'] = (full_train_data['avg-rating'] > 4.5).astype(int)
        # many nulls in this col; tolerate inputs where it has already been removed
        full_train_data = full_train_data.drop(columns='first-release-year', errors='ignore')
        full_train_data.to_csv(out_csv)

    @staticmethod
    def _select_by_ratio(positive_counts, negative_counts, min_ratio):
        """
        Pick the values whose positive/negative count ratio is strictly above min_ratio
        :param positive_counts: dictionary of value to number of rows with target = 1
        :param negative_counts: dictionary of value to number of rows with target = 0
        :param min_ratio: ratio that must be exceeded for a value to be selected
        :return: dictionary whose keys are the selected values (all mapped to None)
        """
        # Only values present in negative_counts with a non-zero count are considered. A value missing from
        # positive_counts counts as 0 positives instead of raising KeyError.
        return {
            value: None
            for value, negatives in negative_counts.items()
            if negatives > 0 and positive_counts.get(value, 0) / negatives > min_ratio
        }

    def get_labels_frequencies(self):
        """
        Gets frequency that values in 'label' column occur with target=1 (awesome) and target=0 (not awesome)
        :return: dictionary keyed by the labels whose positive/negative ratio is above 5
        """
        # dictionary of 'label' value to number of rows with that value and target = 1
        dict_label_to_positive_review = {}
        # dictionary of 'label' value to number of rows with that value and target = 0
        dict_label_to_negative_review = {}
        # NOTE(review): update_dict_frequency is defined in data_exploration; presumably it increments one of the
        # two dictionaries depending on target - confirm there
        for label, target in zip(self.df['label'], self.df['target']):
            DataExploration.update_dict_frequency(label, target, dict_label_to_positive_review,
                                                  dict_label_to_negative_review)
        return DataPreprocessing._select_by_ratio(dict_label_to_positive_review, dict_label_to_negative_review, 5)

    def get_categories_frequencies(self):
        """
        Gets frequency that values in 'categories' column occur with target=1 (awesome) and target=0 (not awesome)
        Note that the 'categories' column of self.df is replaced by its parsed (list) form as a side effect.
        :return: dictionary keyed by the categories whose positive/negative ratio is above 4
        """
        # Parse only string cells so that calling this method a second time on the same DataFrame still works
        self.df['categories'] = self.df['categories'].apply(
            lambda value: literal_eval(value) if isinstance(value, str) else value)
        # dictionary of 'categories' value to number of rows with that value and target = 1
        dict_category_to_positive_review = {}
        # dictionary of 'categories' value to number of rows with that value and target = 0
        dict_category_to_negative_review = {}
        for categories, target in zip(self.df['categories'], self.df['target']):
            if not isinstance(categories, (list, tuple, set)):
                # missing value (e.g. NaN): nothing to count for this row
                continue
            for category in categories:
                DataExploration.update_dict_frequency(category, target, dict_category_to_positive_review,
                                                      dict_category_to_negative_review)
        return DataPreprocessing._select_by_ratio(dict_category_to_positive_review,
                                                  dict_category_to_negative_review, 4)

    @staticmethod
    def categorize_helper(x):
        """
        Categorize values of 'unix_help_percent' into manually selected thresholds
        :param x: value to be categorized
        :return: category (1 for the first range, 2 for the second range, 0 otherwise)
        """
        # We found using the crosstab that many of the non-5 star reviews were within a certain range
        # We therefore cut the data into two "boxes" that using the unixtime*helpful variable could be separated
        # These two boxes included 1/20th boxes which had 8% and 20-30% 5 star reviews, as opposed to 75-80% for the
        # reviews not in range
        if 0.0 < x < 2.154:
            return 1  # 8 percent
        # closed on the left so that the boundary value 2.154 does not fall between the two ranges
        if 2.154 <= x < 8.022:
            return 2  # 20 percent
        return 0

    @staticmethod
    def view_buy_convert(x, k):
        """
        Get length of an array that is nested within a dictionary; this is for processing 'related' column
        :param x: string to be parsed into dictionary (an already parsed dictionary is accepted as well)
        :param k: key into dictionary to access the array
        :return: length of array; 0 if the key is absent or x is not a dictionary (e.g. a missing value)
        """
        related = literal_eval(x) if isinstance(x, str) else x
        if not isinstance(related, dict):
            return 0
        items = related.get(k)
        return len(items) if items is not None else 0

    @staticmethod
    def weigh_review_score(row):
        """
        Returns the product of review-binary prediction using only reviewText, multiplied by helpful percent
        :param row: current row to perform multiplication; must provide 'reviewTextScore' and 'helpful'
        :return: weighted score
        """
        return row['reviewTextScore'] * row['helpful']

    @staticmethod
    def weigh_summary_score(row):
        """
        Returns the product of review-binary prediction using only summary, multiplied by helpful percent
        :param row: current row to perform multiplication; must provide 'summaryTextScore' and 'helpful'
        :return: weighted score
        """
        return row['summaryTextScore'] * row['helpful']

    @staticmethod
    def get_helpful_multiplier(x, threshold=0.48):
        """
        Returns a multiplier that corresponds to whether ratio of helpful ratings to total ratings reaches threshold
        :param x: the string of form "[num1, num2]" to be parsed by literal_eval (a parsed pair is accepted too)
        :param threshold: minimum helpful ratio that earns the higher multiplier; 0.48 is a tuned value
        :return: 2 if denominator is 0 (from observations) or if ratio is at least threshold, otherwise 1
        """
        votes = literal_eval(x) if isinstance(x, str) else x
        helpful_votes, total_votes = votes[0], votes[1]
        # the zero check comes first so the division is never evaluated with a zero denominator
        if total_votes == 0 or helpful_votes / total_votes >= threshold:
            return 2
        return 1

    @staticmethod
    def review_binary(x):
        """
        Adds column review-binary, which is whether review > 4 stars
        :param x: DataFrame (or row) with an 'overall' entry; modified in place
        :return: the same object with 'review-binary' added
        """
        x['review-binary'] = x['overall'] > 4
        return x

    @staticmethod
    def binarize_root_genre(x):
        """
        Manually categorized root-genre based on observations in data exploration
        :param x: DataFrame to modify
        :return: new DataFrame in which the listed 'root-genre' values are replaced by 0 or 1
        """
        cleanup_nums = {"root-genre": {"Pop": 0, "Rock": 0, "Classical": 0, "Latin Music": 0,
                                       "Country": 1, "Jazz": 0, "Dance & Electronic": 0, "Alternative Rock": 0,
                                       "New Age": 0,
                                       "Rap & Hip-Hop": 0, "Metal": 0, "Folk": 1, "R&B": 0, "Blues": 1, "Gospel": 1,
                                       "Reggae": 0}}
        return x.replace(cleanup_nums)

    @staticmethod
    def cat_music(x):
        """
        Categorize music, which is simply increasing integers, which serve merely as a placeholder for their
        corresponding strings
        :param x: DataFrame whose 'categories' column is replaced by its category codes (modified in place)
        :return: modified DataFrame
        """
        x["categories"] = x["categories"].astype('category').cat.codes
        return x

    @staticmethod
    def get_review_year(x):
        """
        Get review year for each review
        :param x: DataFrame to modify; 'year' is derived from 'reviewTime' via DataExploration.review_year_convert
        :return: modified DataFrame
        """
        x['year'] = x['reviewTime'].apply(DataExploration.review_year_convert)
        return x

    @staticmethod
    def unix_helpful_multiplier(train_data):
        """
        Combining unixReviewTime with helpful multiplier to explore possible model improvements
        :param train_data: DataFrame to modify
        :return: modified DataFrame with 'unix_help_multiplier' = log(unixReviewTime) * helpful multiplier
        """
        multipliers = train_data['helpful'].apply(DataPreprocessing.get_helpful_multiplier)
        train_data['unix_help_multiplier'] = np.log(train_data['unixReviewTime']) * multipliers
        return train_data

    @staticmethod
    def unix_helpful_percent(x):
        """
        Combining unixReviewTime with helpful percent to explore possible model improvements
        :param x: DataFrame to modify
        :return: modified DataFrame with 'unix_help_percent' = log(unixReviewTime) * helpful percent
        """
        helpful_percent = x['helpful'].apply(DataExploration.get_helpful_percent)
        x['unix_help_percent'] = np.log(x['unixReviewTime']) * helpful_percent
        return x

    @staticmethod
    def categorize_unix_helpful_percent(x):
        """
        Categorize values in unix_help_percent column into different 'bins'
        :param x: DataFrame to modify
        :return: modified DataFrame
        """
        x['unix_help_percent_cat'] = x['unix_help_percent'].apply(DataPreprocessing.categorize_helper)
        return x

    @staticmethod
    def buy_after_viewing(x):
        """
        Get length of buy_after_viewing list in related column
        :param x: DataFrame to modify
        :return: modified DataFrame
        """
        x['view_buy'] = x['related'].apply(
            lambda related: DataPreprocessing.view_buy_convert(related, 'buy_after_viewing'))
        return x

    @staticmethod
    def binarize_feature(local_train, local_test, column, ratio):
        """
        Counts the percentages of >4.5 reviews of each value in local_train[column]
        Those with percentages above the ratio are given True, others False.
        Examples of column can be 'label' or 'categories'
        :param local_train: train DataFrame (modified in place)
        :param local_test: test DataFrame (modified in place)
        :param column: column name, which is string
        :param ratio: threshold for binarizing, given as positive-to-negative ratio
        :return: modified train and test DataFrames
        """
        # convert the positive:negative ratio into the equivalent share of positives
        threshold = ratio / (ratio + 1)
        # must be computed before local_train[column] is overwritten below
        good_feat = pd.DataFrame(local_train.groupby(column)['target'].mean() > threshold)
        # Train and test go through the same helper so both columns hold plain booleans and values that are
        # missing from the grouped index (e.g. NaN) map to False instead of raising KeyError.
        local_train[column] = local_train[column].apply(DataPreprocessing.binarize_helper, args=(good_feat,))
        local_test[column] = local_test[column].apply(DataPreprocessing.binarize_helper, args=(good_feat,))
        return local_train, local_test

    @staticmethod
    def binarize_helper(x, good_feat):
        """
        Helper function for binarize_feature
        :param x: value to look up
        :param good_feat: DataFrame indexed by value with a boolean 'target' column (True if above ratio)
        :return: the boolean stored for x, or False when x is not in the index
        """
        if x in good_feat.index:
            return bool(good_feat.loc[x, 'target'])
        return False

    @staticmethod
    def custom_token(text):
        """
        Custom tokenizer for TF-IDF; results were not better when we tried using this
        :param text: text to apply the tokenizer on; TF-IDF handles calling and passing text
        :return: list of tokens made only of ASCII letters
        """
        return re.sub(r'[^a-zA-Z]', ' ', text).split()

    @staticmethod
    def wn_lemmatize(text):
        """
        Wordnet lemmatizer/tokenizer for TF-IDF; results were not better
        :param text: text to apply the tokenizer on; TF-IDF handles calling and passing text
        :return: tokenized and lemmatized text with non-word characters (regex \\W) replaced by spaces
        """
        text = re.sub(r'\W', ' ', text)
        lm = WordNetLemmatizer()
        return [lm.lemmatize(word) for word in nltk.word_tokenize(text)]

    @staticmethod
    def get_text_blob_sentiment(df):
        """
        Get sentiment analysis using TextBlob library; modifies the DataFrame directly
        TextBlob has 2 components to sentiment analysis: polarity and subjectivity
        Missing 'summary' / 'reviewText' values are treated as empty strings.
        :param df: DataFrame whose 'reviewText' and 'summary' columns we want to apply TextBlob to
        """
        summaries = df['summary'].fillna('').astype(str)
        reviews = df['reviewText'].fillna('').astype(str)
        sources = (
            ('review', reviews),  # scores for reviewText only
            ('summary', summaries),  # scores for summary only
            ('compound', summaries + ' ' + reviews),  # scores for summary concatenated with reviewText
        )
        for prefix, texts in sources:
            # analyse each text once and read both components from the same result
            sentiments = [TextBlob(text).sentiment for text in texts]
            df[prefix + '-polarity'] = pd.Series([s.polarity for s in sentiments], index=df.index, dtype=float)
            df[prefix + '-subjectivity'] = pd.Series([s.subjectivity for s in sentiments], index=df.index,
                                                     dtype=float)