-
Notifications
You must be signed in to change notification settings - Fork 1.9k
/
data_cleaning8.py
60 lines (40 loc) · 1.58 KB
/
data_cleaning8.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import pandas as pd
# Config settings
# Use the fully qualified option names: set_option() treats the key as a
# pattern that must match exactly one option, and the bare 'max_columns' /
# 'max_rows' patterns also match the 'styler.render.*' options in newer
# pandas releases, which raises OptionError.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 12)

# Import CSV data
data_frames = pd.read_csv(r'simulated_data.csv')

# Data Type Conversion
# Remove '$' from donation strings
data_frames['donation'] = data_frames['donation'].str.strip('$')
# Convert donation strings into a numerical data type
data_frames['donation'] = data_frames['donation'].astype('float64')

# Handle Data Inconsistencies
# Normalize strings: split every address on whitespace into a list of words
# so that the individual words can be normalized afterwards
data_frames['street_address'] = data_frames['street_address'].str.split()
def normalize_words(arr):
    """Normalize every word of a split street address except the first.

    The word at index 0 is left untouched (presumably the house number --
    confirm against the data); each remaining word is replaced by
    ``normalize(word)``.

    Args:
        arr: List of address words; it is modified in place. Any value that
            is not a list (for example the NaN that ``str.split()`` leaves
            for a missing address) is returned unchanged instead of raising
            ``TypeError``.

    Returns:
        The same ``arr`` object that was passed in, so the function works
        both for its in-place effect and as a mapping function.
    """
    if not isinstance(arr, list):
        return arr
    # Slice assignment keeps the mutation in place, so callers that rely on
    # the side effect (and ignore the return value) still see the changes.
    arr[1:] = [normalize(word) for word in arr[1:]]
    return arr
def normalize(word):
    """Return ``word`` capitalized, expanding known street abbreviations.

    ``'st'`` is expanded to ``'Street'`` and ``'rd'`` to ``'Road'``; the
    comparison ignores case. Every other word is only passed through
    ``str.capitalize()``.
    """
    expansions = {'st': 'street', 'rd': 'road'}
    # Fall back to the word itself when it is not a known abbreviation.
    expanded = expansions.get(word.lower(), word)
    return expanded.capitalize()
# Normalize the words of every address in place. A missing address is not a
# list after str.split() (it stays NaN), so it is skipped rather than passed
# to normalize_words(), which would fail trying to iterate over it.
for address_words in data_frames['street_address']:
    if isinstance(address_words, list):
        normalize_words(address_words)
# Re-assemble the word lists into single address strings
data_frames['street_address'] = data_frames['street_address'].str.join(' ')

# Remove Out-of-Range Data
# create boolean Series for out of range donations
out_of_range = data_frames['donation'] < 0
# keep only the rows that are NOT out of range. The rows are dropped here
# (instead of only blanking their donation) so that an invalid row cannot be
# picked as the "first" duplicate below and shadow a valid record.
data_frames = data_frames[~out_of_range]

# Remove duplicates
columns_to_check = ['first_name', 'last_name', 'street_address', 'city', 'state']
data_frames_no_dupes = data_frames.drop_duplicates(subset=columns_to_check, keep='first')

# Drop Missing Data
columns_to_check = ['state', 'donation']
data_frames_no_missing = data_frames_no_dupes.dropna(subset=columns_to_check)

print(data_frames_no_missing.head(20))