-
Notifications
You must be signed in to change notification settings - Fork 1
/
parse_text_column.py
executable file
·91 lines (66 loc) · 3.27 KB
/
parse_text_column.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#!/usr/bin/env python
# encoding: utf-8
import pandas as pd
from pandas import ExcelWriter
import numpy as np
import csv
import matplotlib.pyplot as plt
import math
DATAFILENAME = "./data/The use of software in research (Responses) 24 Oct 14 - Form Responses 1.csv"
STOREFILENAME = "./data/software_in_research_parasable.csv"
def import_csv_to_df(filename):
"""
Imports a csv file into a Pandas dataframe
:params: get an xls file and a sheetname from that file
:return: a df
"""
return pd.read_csv(filename)
def export_df_to_csv(df, filename):
"""
Saves a df as a
:params: get an xls file and a sheetname from that file
:return: a df
"""
return df.to_csv(filename)
def separate_software_packages(df):
"""
Separates the rather messy user-entered free text for which software packages people use
I replace semi-colons and carriage returns with commas. Removes superfluous info, such as brackets
and text within them (these are typically comments and not software packages) and URLs. The
remaining strings are very time-consuming and difficult to parse. However, one-word answers
are easy to parse. Hence, I locate these. The location of all parsable strings is recorded
in a new column and the non-parsable strings are over-written with NaN.
:params: a df
:return: a df with a column of clean strings and a new column saying which columns are parsable
"""
# Original colname is really long, so using this to shorten it
colname = 'Question 11: Please provide the name(s) of the main research software you use.'
# Things to replace with a comma
char_replacees = [';', '\n', 'and', r',+']
# Things to replace with a space
regex_replacees = [r'\(.*\)', r'\".*\"', r'^https?:\/\/.*[\r\n]*']
for current in char_replacees:
df[colname] = df[colname].str.replace(current,',')
print('replacing')
for current in regex_replacees:
df[colname] = df[colname].str.replace(current,'')
# Store the location of the parsable strings (i.e. the ones with commas). It adds a True or False dependent
# on whether it finds a comma in the appropriate row
df['Q11_valid_data'] = df['Question 11: Please provide the name(s) of the main research software you use.'].str.contains(',')
# Add to the parsable strings by also storing the location of any one-word strings (which are likely to be the name of a single software package). If it's already populated, you use the already populated data.
df['Q11_valid_data'] = (df['Q11_valid_data']) | (df['Question 11: Please provide the name(s) of the main research software you use.'].str.count(' ') == 0)
# Anything that's not in 'Q11_valid_data' is likely to be too difficult to parse, so kill it all!
df.loc[df['Q11_valid_data'] == False, 'Question 11: Please provide the name(s) of the main research software you use.'] = np.nan
print('The dataframe contains the following number of parsable rows:')
print(df['Q11_valid_data'].value_counts())
return
def main():
"""
Main function to run program
"""
# Read survey data from csv
df = import_csv_to_df(DATAFILENAME)
separate_software_packages(df)
export_df_to_csv(df, STOREFILENAME)
if __name__ == '__main__':
main()