# data_cleaning5.py
import pandas as pd
# Config settings
# Use the fully qualified option names: the bare 'max_columns' / 'max_rows'
# patterns also match the 'styler.render.*' options in newer pandas releases,
# which makes set_option raise OptionError (pattern matched multiple keys).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 12)

# Import CSV data
data_frames = pd.read_csv(r'simulated_data.csv')

# Data Type Conversion
# Remove '$' from donation strings
data_frames['donation'] = data_frames['donation'].str.strip('$')
# Convert donation strings into numerical data type
data_frames['donation'] = data_frames['donation'].astype('float64')

# Handle Data Inconsistencies
# Normalize strings
# Split each address on whitespace so its words can be normalized one by one
data_frames['street_address'] = data_frames['street_address'].str.split()
def normalize_words(arr):
    """Normalize, in place, every word of ``arr`` except the first.

    The element at index 0 is left untouched; each later element is
    replaced by ``normalize(element)``. Nothing is returned.
    """
    # Iterate over a copy of the tail; the writes go to the original list.
    for position, token in enumerate(arr[1:], start=1):
        arr[position] = normalize(token)
def normalize(word):
    """Expand a street-type abbreviation and capitalize the result.

    'st' and 'rd' (compared case-insensitively) become 'Street' and
    'Road'. Any other word is returned with its first character
    upper-cased and the rest lower-cased.
    """
    expansions = {'st': 'street', 'rd': 'road'}
    # Fall back to the word itself when it is not a known abbreviation.
    expanded = expansions.get(word.lower(), word)
    return expanded.capitalize()
# Normalize each row's word list in place (normalize_words returns nothing)
for address_words in data_frames['street_address']:
    normalize_words(address_words)

# Re-join the words of each address with single spaces
data_frames['street_address'] = data_frames['street_address'].str.join(' ')

# Show the first 10 rows
print(data_frames.head(10))