Skip to content

Commit

Permalink
999999999_268072.py (#42): started brazilian IBGE automated importer
Browse files Browse the repository at this point in the history
  • Loading branch information
fititnt committed May 16, 2022
1 parent 449ed01 commit b130339
Show file tree
Hide file tree
Showing 2 changed files with 368 additions and 2 deletions.
4 changes: 2 additions & 2 deletions officinam/999999999/0/999999999_10263485.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
# LICENSE: Public Domain dedication or Zero-Clause BSD
# SPDX-License-Identifier: Unlicense OR 0BSD
# VERSION: v1.0.0
# CREATED: 2022-05-16 16:29 UTC
# CREATED: 2022-05-16 16:29 UTC based on hotfix0s.py
# REVISION: ---
# ==============================================================================

Expand Down Expand Up @@ -114,7 +114,7 @@ def make_args(self, hxl_output=True):

parser.add_argument(
'infile',
help='Input file',
help='Arquivo de entrada',
nargs='?'
)

Expand Down
366 changes: 366 additions & 0 deletions officinam/999999999/0/999999999_268072.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,366 @@
#!/usr/bin/env python3
# ==============================================================================
#
# FILE: 999999999_268072.py
#
# USAGE: ./999999999/0/999999999_268072.py
# ./999999999/0/999999999_268072.py --help
#
# DESCRIPTION: Run ./999999999/0/999999999_268072.py --help
#
# OPTIONS: ---
#
# REQUIREMENTS: - python3
# BUGS: ---
# NOTES: ---
# AUTHORS: Emerson Rocha <rocha[at]ieee.org>
# COLLABORATORS: ---
# COMPANY: EticaAI
# LICENSE: Public Domain dedication or Zero-Clause BSD
# SPDX-License-Identifier: Unlicense OR 0BSD
# VERSION: v1.0.0
# CREATED: 2022-05-16 21:16 UTC based on 999999999_10263485.py
# REVISION: ---
# ==============================================================================

import sys
import argparse
import csv
import re

import json
import urllib.request

# Binary view of standard input; used as the default ``stdin`` argument of
# Cli.execute_cli.
STDIN = sys.stdin.buffer

# Program description passed to argparse as ``description``.
# ``{0}`` is filled in with this script's path (``__file__``).
DESCRIPTION = """
{0} Processamento de dados de referência do IBGE (Brasil).
@see https://github.com/EticaAI/lexicographi-sine-finibus/issues/42
@see https://servicodados.ibge.gov.br/api/docs
Trivia:
- Q268072, https://www.wikidata.org/wiki/Q268072
- IBGE - Instituto Brasileiro de Geografia e Estatística
- "instituto público da administração federal brasileira criado em 1934
e instalado em 1936 com o nome de Instituto Nacional de Estatística (...)"
""".format(__file__)

# Usage examples passed to argparse as ``epilog``.
# ``{0}`` is filled in with this script's path (``__file__``). The trailing
# backslash inside the string joins the two lines of the redirect example.
__EPILOGUM__ = """
------------------------------------------------------------------------------
EXEMPLŌRUM GRATIĀ
------------------------------------------------------------------------------
{0} --methodus=ibge-un-adm2 --objectivum-formato=url-fonti
{0} --methodus=ibge-un-adm2 --objectivum-formato=json-fonti
{0} --methodus=ibge-un-adm2 --objectivum-formato=csv
{0} --methodus=ibge-un-adm2 --objectivum-formato=json-fonti-formoso \
> 999999/0/ibge-un-adm2.json
@TODO: fazer funcionar com stream de JSON (não apenas por URI)
------------------------------------------------------------------------------
EXEMPLŌRUM GRATIĀ
------------------------------------------------------------------------------
""".format(__file__)

# Header prefixes of columns whose cells get a trailing ".0" removed by
# CliMain.process_row (a header cell matches when it starts with one of
# these after spaces are removed).
LIKELY_NUMERIC = [
'#item+conceptum+codicem',
'#status+conceptum',
'#item+rem+i_qcc+is_zxxx+ix_n1603',
'#item+rem+i_qcc+is_zxxx+ix_iso5218',
]
# https://en.wiktionary.org/wiki/tabula#Latin
# Identity mapping from a field name to itself.
# NOTE(review): not referenced anywhere in the visible part of this file;
# presumably the XML-attribute -> CSV-column table of the script this one
# was based on -- confirm before relying on it.
XML_AD_CSV_TABULAE = {
'CO_UNIDADE': 'CO_UNIDADE',
'NO_FANTASIA': 'NO_FANTASIA',
'CO_MUNICIPIO_GESTOR': 'CO_MUNICIPIO_GESTOR',
'NU_CNPJ': 'NU_CNPJ',
'CO_CNES': 'CO_CNES',
'DT_ATUALIZACAO': 'DT_ATUALIZACAO',
'TP_UNIDADE': 'TP_UNIDADE',
}

# Mapping from the field names above to hashtag-style column headers
# ("#item+..." / "#meta+...").
# NOTE(review): not referenced anywhere in the visible part of this file;
# presumably intended for the hxltm-csv / hxltm-tsv output formats -- confirm.
CSV_AD_HXLTM_TABULAE = {
# @TODO: create wikiq
'CO_UNIDADE': '#item+rem+i_qcc+is_zxxx+ix_brcnae',
'NO_FANTASIA': '#meta+NO_FANTASIA',
'CO_MUNICIPIO_GESTOR': '#item+rem+i_qcc+is_zxxx+ix_wikip1585',
'NU_CNPJ': '#item+rem+i_qcc+is_zxxx+ix_wikip6204',
'CO_CNES': '#meta+CO_CNES',
'DT_ATUALIZACAO': '#meta+DT_ATUALIZACAO',
'TP_UNIDADE': '#meta+TP_UNIDADE',
}


# Source URL for each value accepted by --methodus; looked up by
# Cli.execute_cli for the url-fonti / json-fonti output formats.
# NOTE(review): the query parameter is spelled "oorderBy"; this looks like a
# typo for "orderBy" -- confirm against the IBGE API documentation before
# changing it, since the server's handling of either spelling is not
# visible from this file.
METHODUS_FONTI = {
'ibge-un-adm2': 'https://servicodados.ibge.gov.br/api' +
'/v1/localidades/distritos?view=nivelado&oorderBy=id'
}

# @TODO implementar malhas https://servicodados.ibge.gov.br/api/docs/malhas?versao=3
# ./999999999/0/999999999_268072.py 999999/0/1603_1_1--old.csv 999999/0/1603_1_1--new.csv


class Cli:
    """Command-line entry point: parses arguments and dispatches to CliMain.

    The ``EXIT_*`` class attributes are the status codes returned by
    :meth:`execute_cli`.
    """

    EXIT_OK = 0
    EXIT_ERROR = 1
    EXIT_SYNTAX = 2

    def __init__(self):
        """Construct the Cli object; it keeps no instance state."""

    def make_args(self, hxl_output=True):
        """Build the argument parser and parse the process arguments.

        Args:
            hxl_output: Not used by this method; kept so that existing
                callers passing it keep working.

        Returns:
            The ``argparse.Namespace`` with ``infile``, ``methodus`` and
            ``objectivum_formato``.
        """
        parser = argparse.ArgumentParser(
            prog="999999999_268072",
            description=DESCRIPTION,
            formatter_class=argparse.RawDescriptionHelpFormatter,
            epilog=__EPILOGUM__
        )

        parser.add_argument(
            'infile',
            help='Input file',
            nargs='?'
        )

        parser.add_argument(
            '--methodus',
            help='Modo de operação.',
            dest='methodus',
            nargs='?',
            choices=[
                'ibge-un-adm2',
            ],
            # With nargs='?' a bare "--methodus" stores ``const``. Left
            # unset that is None, which is not validated against
            # ``choices`` and later fails as a dictionary key.
            const='ibge-un-adm2',
            default='ibge-un-adm2'
        )

        # objectīvum, n, s, nominativus,
        #   https://en.wiktionary.org/wiki/objectivus#Latin
        # fōrmātō, n, s, dativus, https://en.wiktionary.org/wiki/formatus#Latin
        parser.add_argument(
            '--objectivum-formato',
            help='Formato do arquivo exportado',
            dest='objectivum_formato',
            nargs='?',
            choices=[
                'url-fonti',
                'json-fonti',
                'json-fonti-formoso',
                'csv',
                'tsv',
                'hxltm-csv',
                'hxltm-tsv',
            ],
            # Same reasoning as for --methodus: never let a bare flag
            # produce None.
            const='csv',
            default='csv'
        )

        return parser.parse_args()

    def execute_cli(self, pyargs, stdin=STDIN, stdout=sys.stdout,
                    stderr=sys.stderr):
        """Run the operation selected by the parsed arguments.

        Args:
            pyargs: Namespace returned by :meth:`make_args`.
            stdin: Input stream, used only when no ``infile`` was given and
                the stream is not a terminal.
            stdout: Stream that receives the source URL for the
                ``url-fonti`` format.
            stderr: Stream that receives error messages.

        Returns:
            One of the ``EXIT_*`` status codes.
        """
        methodus = pyargs.methodus
        formato = pyargs.objectivum_formato

        if methodus not in METHODUS_FONTI:
            print('Unknown option.', file=stderr)
            return self.EXIT_ERROR

        # An explicit file argument wins; piped stdin is only the fallback.
        # The stream is handed over untouched: reading from it here would
        # leave nothing for CliMain to parse.
        _infile = pyargs.infile
        _stdin = None
        if _infile is None and not stdin.isatty():
            _stdin = stdin

        climain = CliMain(infile=_infile, stdin=_stdin,
                          objectivum_formato=formato)

        uri = METHODUS_FONTI[methodus]

        # The *-fonti formats only need the source URL, not local input.
        if formato == 'url-fonti':
            print(uri, file=stdout)
            return self.EXIT_OK
        if formato == 'json-fonti':
            return climain.json_fonti(uri)
        if formato == 'json-fonti-formoso':
            return climain.json_fonti(uri, True)

        if _infile is None and _stdin is None:
            print('ERROR. Please pipe data in or pass an input file.',
                  file=stderr)
            return self.EXIT_ERROR

        if methodus == 'ibge-un-adm2':
            return climain.execute_ex_datasus_xmlcnae()

        print('Unknown option.', file=stderr)
        return self.EXIT_ERROR


class CliMain:
    """Worker behind the command line.

    It can print the JSON document served by a source URL
    (:meth:`json_fonti`), convert XML ``ROW`` elements into delimited text
    (:meth:`execute_ex_datasus_xmlcnae`) and strip a trailing ``.0`` from
    likely-numeric CSV columns (:meth:`process_row`, :meth:`execute`).
    """

    def __init__(self, infile: str = None, stdin=None,
                 objectivum_formato: str = 'hxltm-csv',
                 outfile: str = None):
        """Store the input/output configuration.

        Args:
            infile: Path of the input file, or None to read from ``stdin``.
            stdin: Input stream used when ``infile`` is None.
            objectivum_formato: Output format name; ``tsv`` and
                ``hxltm-tsv`` select a tab delimiter, anything else a comma.
            outfile: Path written by :meth:`execute`; None means standard
                output.
        """
        self.infile = infile
        self.stdin = stdin
        self.objectivum_formato = objectivum_formato
        self.outfile = outfile

        # Header row seen by process_row, and the indexes of its columns
        # that need the trailing ".0" removed.
        self.header = []
        self.header_index_fix = []

    def json_fonti(self, uri: str, formosum: bool = False) -> int:
        """Download the JSON document at *uri* and print it to stdout.

        Args:
            uri: URL to fetch.
            formosum: When true, pretty-print with a two-space indent;
                otherwise print compact JSON on one line.

        Returns:
            Cli.EXIT_OK.

        Raises:
            urllib.error.URLError: If the request fails.
            json.JSONDecodeError: If the response body is not valid JSON.
        """
        # The context manager closes the HTTP response even on error.
        with urllib.request.urlopen(uri) as responsum:
            data = responsum.read()
        output = json.loads(data)
        if formosum:
            print(json.dumps(output,
                             indent=2, ensure_ascii=False, sort_keys=False))
        else:
            # Serialize again instead of printing the Python object: its
            # repr (single quotes, True/None) is not valid JSON.
            print(json.dumps(output, ensure_ascii=False))
        return Cli.EXIT_OK

    def process_row(self, row: list) -> list:
        """Normalize one CSV row, remembering the header on first sight.

        Until a header has been recorded, a row whose first cell starts
        with ``#`` becomes the header and the columns matching
        LIKELY_NUMERIC are noted; any other row is returned untouched.
        After that, a trailing ``.0`` is stripped (in place) from the noted
        columns of every row.

        Args:
            row: The cells of one row; may be empty.

        Returns:
            The same list object, possibly modified.
        """
        if not self.header:
            # ``row and`` guards blank lines, which csv yields as [].
            if row and row[0].strip().startswith('#'):
                self.header = row
                likely = tuple(LIKELY_NUMERIC)
                for index, item in enumerate(self.header):
                    item_norm = item.strip().replace(" ", "")
                    if item_norm.startswith(likely):
                        self.header_index_fix.append(index)
            return row

        for index_fix in self.header_index_fix:
            # Rows shorter than the header are left as they are.
            if index_fix < len(row):
                row[index_fix] = re.sub(
                    r'\.0$', '', row[index_fix].strip())
        return row

    def execute_ex_datasus_xmlcnae(self):
        """Convert XML ``ROW`` elements from the input into delimited text.

        Every element whose tag is ``ROW`` (case-insensitive) and that has
        attributes becomes one output line holding its attribute values.
        The attribute names of the first such element are written once as
        the header line. Output goes to ``sys.stdout``.

        Returns:
            Cli.EXIT_OK.

        Raises:
            xml.etree.ElementTree.ParseError: If the input is not
                well-formed XML.
        """
        # Imported here because the module-level imports do not provide it.
        import xml.etree.ElementTree as XMLElementTree

        fontem = self.infile if self.infile is not None else self.stdin
        if self.objectivum_formato in ('tsv', 'hxltm-tsv'):
            delimiter = "\t"
        else:
            delimiter = ','
        objectivum = csv.writer(
            sys.stdout, delimiter=delimiter, quoting=csv.QUOTE_MINIMAL)

        caput_okay = False
        # Only 'end' events are needed: an element's attributes are
        # complete by then.
        for _event, elem in XMLElementTree.iterparse(
                source=fontem, events=('end',)):
            if elem.tag.upper() != 'ROW':
                continue
            if elem.attrib:
                if not caput_okay:
                    objectivum.writerow(list(elem.attrib))
                    caput_okay = True
                objectivum.writerow(list(elem.attrib.values()))
            # Release the processed element so large inputs stay cheap.
            elem.clear()

        return Cli.EXIT_OK

    def execute(self):
        """Copy the CSV at ``self.infile`` through :meth:`process_row`.

        The result is written to ``self.outfile``, or to ``sys.stdout``
        when no output path was configured.
        """
        with open(self.infile, newline='') as infilecsv:
            if self.outfile is None:
                self._scribe_lineas(infilecsv, sys.stdout)
                return
            with open(self.outfile, 'w', newline='') as outfilecsv:
                self._scribe_lineas(infilecsv, outfilecsv)

    def _scribe_lineas(self, fons, objectivum):
        """Write every CSV row of *fons*, normalized, to *objectivum*."""
        spamwriter = csv.writer(objectivum)
        for row in csv.reader(fons):
            spamwriter.writerow(self.process_row(row))


if __name__ == "__main__":

    est_cli = Cli()
    args = est_cli.make_args()

    # Propagate the EXIT_* status to the shell; without this the process
    # would always exit with 0, even when execute_cli reports an error.
    sys.exit(est_cli.execute_cli(args))

0 comments on commit b130339

Please sign in to comment.