-
Notifications
You must be signed in to change notification settings - Fork 25
/
data_cut_off.py
24 lines (18 loc) · 925 Bytes
/
data_cut_off.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
# def train_valid_test_split(x_data, y_data,
# validation_size=0.1, test_size=0.1, shuffle=True):
# x_, x_test, y_, y_test = train_test_split(x_data, y_data, test_size=test_size, shuffle=shuffle)
# valid_size = validation_size / (1.0 - test_size)
# x_train, x_valid, y_train, y_valid = train_test_split(x_, y_, test_size=valid_size, shuffle=shuffle)
# return x_train, x_valid, x_test, y_train, y_valid, y_test
def split_dev_train(frame, dev_fraction=0.1):
    """Split *frame* row-wise into a leading dev part and a trailing train part.

    The first ``int(len(frame) * dev_fraction)`` rows become the dev set and
    all remaining rows become the train set. Rows are taken in their current
    order; shuffle beforehand if a random sample is wanted.

    Args:
        frame: DataFrame to split.
        dev_fraction: Share of rows (between 0 and 1) to put in the dev set.
            The row count is rounded down.

    Returns:
        A ``(dev_set, train_set)`` tuple of DataFrames that together contain
        every row of *frame* exactly once.

    Raises:
        ValueError: If *dev_fraction* is outside ``[0, 1]``.
    """
    if not 0.0 <= dev_fraction <= 1.0:
        raise ValueError("dev_fraction must be between 0 and 1")
    # iloc needs an integer position; plain `/` yields a float on Python 3.
    dev_rows = int(len(frame) * dev_fraction)
    # Use real slices: `iloc[n, -1]` would address a single cell
    # (row n, last column) rather than "every row from n onwards".
    return frame.iloc[:dev_rows], frame.iloc[dev_rows:]


if __name__ == '__main__':
    path = "glue/"
    pd_all = pd.read_csv(os.path.join(path, "train.tsv"), sep='\t')
    # Shuffle first so the rows carved off for dev are a random sample.
    pd_all = shuffle(pd_all)
    dev_set, train_set = split_dev_train(pd_all)
    dev_set.to_csv(os.path.join(path, "dev.tsv"), index=False, sep='\t')
    # NOTE: this overwrites the input train.tsv with the reduced training
    # set, so re-running the script carves off another 10% each time.
    train_set.to_csv(os.path.join(path, "train.tsv"), index=False, sep='\t')