#!/usr/bin/env python
'''
getix.py

This script is an in-progress effort to make it easy to parse iXBRL files and generate
spreadsheet output (Excel and CSV). It will soon be phased out in favor of cafr_excel.py.

Resources
- iXBRL spec: http://www.xbrl.org/specification/inlinexbrl-part1/rec-2013-11-18/inlinexbrl-part1-rec-2013-11-18.html
- iXBRL schema: http://www.xbrl.org/specification/inlinexbrl-part2/rec-2013-11-18/inlinexbrl-part2-rec-2013-11-18.html
- iXBRL primer: http://www.xbrl.org/WGN/inlineXBRL-part0/WGN-2015-12-09/inlineXBRL-part0-WGN-2015-12-09.html
- XBRL (Wikipedia): https://en.wikipedia.org/wiki/XBRL

Libraries
- BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/bs4/doc/
'''

urls = ['https://xbrlus.github.io/cafr/samples/3/Alexandria-2018-Statements.htm',
        'https://xbrlus.github.io/cafr/samples/4/FallsChurch-2018-Statements.htm',
        'https://xbrlus.github.io/cafr/samples/5/Loudoun-2018-Statements.htm',
        'https://xbrlus.github.io/cafr/samples/6/ga-20190116.htm',
        'https://xbrlus.github.io/cafr/samples/1/StPete_StmtNetPos_iXBRL_20190116.htm',
        'https://xbrlus.github.io/cafr/samples/2/VABeach_StmtNetPos_iXBRL_20190116.htm',
        'https://xbrlus.github.io/cafr/samples/7/ut-20190117.htm']

# In[244]:

import re
import requests
from bs4 import BeautifulSoup
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
import datetime

# In Python 3.7+ the built-in dict preserves insertion order, but to support people on
# earlier versions we use an OrderedDict; otherwise the output column order would be scrambled.
from collections import OrderedDict

from ixbrl import Criterion, XbrliDocument


class InputCriteria:
    ''' Represents input criteria for an output field from config.csv. '''

    # Example of the text being parsed:
    # cafr:CashAndCashEquivalents (cafr:AccrualBasisOfAccountingMember cafr:GovernmentalTypeActivityMember cafr:PrimaryGovernmentActivitiesMember)
    regex = re.compile(r'(.*?)\s*\((.*?)\)')

    def __init__(self, text):
        result = self.regex.search(text)
        try:
            self.name = result.group(1)
            self.required_members = result.group(2).split()
        except AttributeError:
            # Some criteria, such as cafr:DocumentName, don't have context info,
            # so the regex doesn't match and result is None.
            self.name = text
            self.required_members = []

    def matches_element(self, element):
        try:
            if element.name != self.name:
                return False
        except:
            return False

        # element.context may be missing if no formal context definition was provided;
        # in that case, fall back to matching the contextref attribute directly.
        try:
            context_members = element.context.explicit_members
        except:
            context_members = [element.tag['contextref']]

        for member in self.required_members:
            if member not in context_members:
                return False
        return True
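
# A quick illustration of how InputCriteria splits the config text; the values shown
# mirror the example tag above (an explanatory sketch only, not executed code):
#
#   c = InputCriteria('cafr:CashAndCashEquivalents (cafr:AccrualBasisOfAccountingMember cafr:GovernmentalTypeActivityMember)')
#   c.name             -> 'cafr:CashAndCashEquivalents'
#   c.required_members -> ['cafr:AccrualBasisOfAccountingMember', 'cafr:GovernmentalTypeActivityMember']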


class SummarySpreadsheet:
    def __init__(self, paths=[], urls=[], config_path='config.csv'):
        self.paths = paths
        self.urls = urls
        self.config_path = config_path
        self.docs = []

        # Load all specified documents.
        for path in paths:
            print(f'Loading {path}...')
            doc = XbrliDocument(path=path)
            self.docs.append(doc)

        for url in urls:
            print(f'Downloading {url}...')
            try:
                doc = XbrliDocument(url=url)
                self.docs.append(doc)
            except:
                # Skip documents that fail to download or parse.
                pass

    def to_csv(self, path='output.csv'):
        self.dataframe.to_csv(path, index=False)

    def to_excel(self, path='output.xlsx', number_format='#,##0', col_width=45, freeze_cols=3):
        # To keep Excel from treating numbers as strings, the column type has to be numeric.
        # The easy approach is to try converting each column and keep it as text if that fails
        # (conversion fails if any value in the column is not a number).
        df = self.dataframe
        num_cols = len(self.output_fields)
        num_rows = len(df)
        for col in df.columns:
            try:
                df[col] = self._to_numeric(df[col])
            except:
                pass

        # Add number formatting to the Excel file.
        # https://xlsxwriter.readthedocs.io/example_pandas_column_formats.html

        # Create a Pandas Excel writer using XlsxWriter as the engine.
        writer = pd.ExcelWriter(path, engine='xlsxwriter')
        df.to_excel(writer, sheet_name='Sheet1', index=False)
        workbook = writer.book
        worksheet = writer.sheets['Sheet1']

        # Append a computed column on the right, titled General Fund Balance Ratio:
        # General Fund Balance / General Fund Expenditure. The formula is =R2 / T2,
        # with 2 replaced by each row number (the default config layout is assumed to
        # put those two fields in columns R and T).
        col_index = num_cols
        header_format = workbook.add_format({'align': 'center', 'bold': True, 'bg_color': 'yellow', 'bottom': True, 'left': True, 'right': True})
        worksheet.write(0, col_index, 'General Fund Balance Ratio', header_format)

        formula_format = workbook.add_format({'bg_color': 'yellow', 'num_format': '0.00'})
        for row_index in range(1, num_rows + 1):
            # Excel row references are 1-based, hence the +1 in the formula.
            worksheet.write(row_index, col_index, f'=R{row_index+1} / T{row_index+1}', formula_format)

        # Apply column width and number format to all columns.
        num_format = workbook.add_format({'num_format': number_format})
        worksheet.set_column(0, num_cols, col_width, num_format)

        # Freeze the specified number of columns.
        if freeze_cols:
            worksheet.freeze_panes(0, freeze_cols)

        # Close the Pandas Excel writer and output the Excel file.
        writer.save()

    @property
    def dataframe(self):
        # Build up a data dictionary to become the DataFrame.
        sheet_data = OrderedDict()

        # For each output field, go through each doc and get the value based on the input criteria.
        for output_name, inputs in self.output_fields.items():
            values = []
            for doc in self.docs:
                # The criteria may match more than one element in the document. In that case,
                # if the matching elements have a date in their contexts, choose the most
                # recent date. Otherwise choose the last element found.
                for criteria in inputs:
                    elements_found = []
                    for element in doc.ix_elements:
                        if criteria.matches_element(element):
                            elements_found.append(element)
                    if elements_found:
                        # Sort the matches by context date (if any) and take the most recent.
                        # With no context dates this simply returns the last element found.
                        elements_found.sort(key=lambda element: element.context.period)
                        values.append(elements_found[-1].string)
                        break

                # If no input criteria matched for this doc, use an empty value.
                if not elements_found:
                    values.append('')

            sheet_data[output_name] = values

        return DataFrame(sheet_data)

    @property
    def output_fields(self):
        '''
        If the config CSV file exists, it is used to determine what to output.
        If the config file doesn't exist, all fields found in the docs are used.

        CSV format:

            Output Field Name,Input Field Name (minimum required contexts)
            Document Title,cafr:DocumentTitle

        The same Output Field Name can be used multiple times, to allow multiple
        Input Field Names to be tied to the same output name. In that case, only the
        first matching Input Field Name is used for a given document.

        Returns a dictionary with the output field name as the key and a list of
        InputCriteria objects as the value.
        '''
        # Return the cached value if this property has already been built.
        try:
            return self._output_fields
        except AttributeError:
            pass

        self._output_fields = OrderedDict()
        try:
            df = pd.read_csv(self.config_path)
            for output_name, input_name in df.itertuples(index=False):
                inputs = self._output_fields.setdefault(output_name, [])
                inputs.append(InputCriteria(input_name))
        except FileNotFoundError:
            for doc in self.docs:
                for key in doc.ix_fields:
                    if key not in self._output_fields:
                        # No config, so the criteria is built from the key name alone.
                        self._output_fields[key] = [InputCriteria(key)]

        return self._output_fields
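
    # A hypothetical config.csv illustrating the format described in the docstring above,
    # including one output name ("Cash") mapped to two alternative input criteria
    # (the first criterion that matches a given document wins); the member names here
    # are examples, not a prescribed taxonomy:
    #
    #   Output Field Name,Input Field Name (minimum required contexts)
    #   Document Title,cafr:DocumentTitle
    #   Cash,cafr:CashAndCashEquivalents (cafr:PrimaryGovernmentActivitiesMember)
    #   Cash,cafr:CashAndCashEquivalents (cafr:GovernmentalTypeActivityMember)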

    def _to_numeric(self, iterable, downcast='signed'):
        ''' Fixes up problems with converting strings to numbers, then uses pd.to_numeric()
        to do the conversion. Raises an exception if any value is not numeric. '''
        converted = []
        for item in iterable:
            if isinstance(item, str):
                # Numeric conversion can't handle commas.
                item = item.replace(',', '')

                # An empty string will fail to convert, so turn it into NaN.
                if item == '':
                    item = np.nan
            converted.append(item)
        return pd.to_numeric(converted, downcast=downcast)
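
# Rough usage sketch for the class above (the file name here is hypothetical; any local
# iXBRL .htm/.xhtml document plus an optional config.csv next to the script would do):
#
#   ss = SummarySpreadsheet(paths=['Alexandria-2018-Statements.htm'])
#   ss.to_csv('output.csv')
#   ss.to_excel('output.xlsx')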


# In[265]:

def main(paths=None):
    ''' For development, pass a list of local paths; the module-level urls will then be skipped. '''
    if paths:
        spreadsheet = SummarySpreadsheet(paths=paths)
    else:
        spreadsheet = SummarySpreadsheet(urls=urls)

    spreadsheet.to_csv()
    print('Generated output.csv')

    spreadsheet.to_excel()
    print('Generated output.xlsx')


# In[266]:

def test():
    from pathlib import Path
    paths = [str(path) for path in Path('test_data').iterdir() if '.xhtml' in str(path) or '.htm' in str(path)]
    main(paths)


# In[267]:

#main()
test()
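
# To run against the published sample documents in `urls` instead of local test_data
# files, comment out the test() call above and un-comment main().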