Skip to content

Commit

Permalink
999999999_521850.py (#43): early draft of No1 (generic HXL from only …
Browse files Browse the repository at this point in the history
…tabula HXLTM)
  • Loading branch information
fititnt committed Jul 30, 2022
1 parent 8ea96f2 commit f5f1ecc
Show file tree
Hide file tree
Showing 2 changed files with 125 additions and 9 deletions.
118 changes: 112 additions & 6 deletions officina/999999999/0/999999999_521850.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
from L999999999_0 import (
# hxltm_carricato,
NUMERORDINATIO_BASIM,
numerordinatio_neo_separatum,
# TabulaAdHXLTM
)

Expand Down Expand Up @@ -221,6 +222,11 @@
# '#item+rem+i_qcc+is_zxxx+ix_xywdatap1540': r"^#population\+m$",
}

# Generic HXLTM -> No1 header conversion table, consumed by _no1lize().
# Key: str.format-style template of the resulting No1 hashtag.
# Value: regex tested (with re.match, i.e. anchored at the start only)
# against each input header cell.
# The named groups captured by the regex fill the template fields:
#   t  - hashtag name without the leading '#' (3 to 99 chars of [a-z0-9])
#   v1 - the 4 digits following 'ix_iso8601v' (presumably a year; confirm)
#   v2 - the 1 to 12 digits following 'ix_xywdatap'
# The remaining '{trivio}' field is not captured by the regex; _no1lize()
# injects it from the instance attribute rdf_trivio before formatting.
DATA_NO1_DE_HXLTM_GENERIC = {
# Earlier draft of the pattern, kept for reference (no ix_iso8601v part):
# '^(?P<t>#[a-z0-9]{3,99})\+rem\+i_qcc\+is_zxxx\+ix_xywdatap(?P<v2>[0-9]{1,12})'
'#{t}+rem+i_qcc+is_zxxx+ix_iso8601v{v1}+rdf_p_wdata_p{v2}_s{trivio}': r"^#(?P<t>[a-z0-9]{3,99})\+rem\+i_qcc\+is_zxxx\+ix_iso8601v(?P<v1>[0-9]{4})\+ix_xywdatap(?P<v2>[0-9]{1,12})"
}

DATA_HXL_DE_CSV_REGEX = {
# @see https://data.humdata.org/tools/hxl-example/
# @see https://data.worldbank.org/indicator
Expand Down Expand Up @@ -454,6 +460,24 @@ def make_args(self):
default=None
)

parser.add_argument(
'--numerordinatio-praefixo',
help='Numerordĭnātĭo prefix',
dest='numerordinatio_praefixo',
nargs='?',
default='999999:0'
)

parser.add_argument(
'--rdf-trivio',
help='(Advanced) RDF bag; extract triples from tabular data from '
'other groups than 1603',
dest='rdf_trivio',
nargs='?',
# required=True,
default='1603'
)

return parser.parse_args()

def execute_cli(self, pyargs, stdin=STDIN, stdout=sys.stdout,
Expand Down Expand Up @@ -531,7 +555,8 @@ def execute_cli(self, pyargs, stdin=STDIN, stdout=sys.stdout,
if pyargs.methodus_fonti == 'worldbank':
# print(DATA_SCRAPPING_HELP['WORLDBANK'])
ds_worldbank = DataScrappingWorldbank(
pyargs.methodus, pyargs.objectivum_formato)
pyargs.methodus, pyargs.objectivum_formato,
pyargs.numerordinatio_praefixo, pyargs.rdf_trivio)
ds_worldbank.praeparatio()
ds_worldbank.imprimere()
return self.EXIT_OK
Expand Down Expand Up @@ -663,9 +688,21 @@ def execute_cli(self, pyargs, stdin=STDIN, stdout=sys.stdout,

class DataScrapping:

def __init__(self, methodus: str, objectivum_formato: str):
def __init__(
self, methodus: str,
objectivum_formato: str,
numerordinatio_praefixo: str = '999999:0',
rdf_trivio: str = '1603'
):

self.methodus = methodus
if numerordinatio_praefixo:
self.numerordinatio_praefixo = numerordinatio_neo_separatum(
numerordinatio_praefixo, ':'
)
# if rdf_trivio:
self.rdf_trivio = rdf_trivio

self.objectivum_formato = objectivum_formato
self._temp = {}

Expand Down Expand Up @@ -751,20 +788,68 @@ def _hxltmize(self, caput: list):
)
return resultatum

def de_csv_ad_csvnorm(self, fonti: str, objetivum: str, caput_initiali: list):
def _no1lize(self, caput: list):
    """Rewrite a header row of HXLTM hashtags into No1 hashtags.

    Every cell of ``caput`` is checked against the entries of
    DATA_NO1_DE_HXLTM_GENERIC, in order; the first entry that applies
    decides the output cell:

    - the entry value is a string equal to the cell: the entry key is
      emitted verbatim;
    - the entry value matches the start of the cell as a regex: the
      entry key is emitted, formatted with the named groups of the match
      plus ``trivio`` (taken from ``self.rdf_trivio``) when the match
      has named groups, or verbatim when it has none.

    Empty cells become ``''`` and cells that no entry applies to are
    copied through unchanged.

    Args:
        caput: header cells (hashtags) of the input table.

    Returns:
        A new list with one converted cell per input cell.
    """
    conversum = []
    for hashtag in caput:
        if not hashtag:
            # Falsy cells (empty string, None) are normalised to ''.
            conversum.append('')
            continue

        for exemplar, regula in DATA_NO1_DE_HXLTM_GENERIC.items():
            # Literal rule: exact string equality wins before any regex.
            if isinstance(regula, str) and regula == hashtag:
                conversum.append(exemplar)
                break

            congruentia = re.match(regula, hashtag)
            if not congruentia:
                continue

            captum = congruentia.groupdict()
            if captum:
                # The RDF bag is not part of the hashtag itself; it is
                # supplied by the instance and merged into the fields.
                captum['trivio'] = self.rdf_trivio
                conversum.append(exemplar.format_map(captum))
            else:
                conversum.append(exemplar)
            break
        else:
            # No rule applied: leave the hashtag as it is.
            conversum.append(hashtag)

    return conversum

def de_csv_ad_csvnorm(
self, fonti: str, objetivum: str, caput_initiali: list):
# print("TODO de_csv_ad_csvnorm")
with open(objetivum, 'w') as _objetivum:
with open(fonti, 'r') as _fons:
_csv_reader = csv.reader(_fons)
_csv_writer = csv.writer(_objetivum)
started = False
strip_last = None
for linea in _csv_reader:
# print(linea)

if not started:
if linea and linea[0].strip() in caput_initiali:
started = True
strip_last = len(linea[-1]) == 0
else:
continue
if strip_last:
linea.pop()
_csv_writer.writerow(linea)

# print("TODO")
Expand All @@ -784,6 +869,8 @@ def de_csvnorm_ad_hxl(self, fonti: str, objetivum):

def de_hxl_ad_hxltm(self, fonti: str, objetivum: str):
# print("TODO de_csv_ad_csvnorm")
index_linea = 0
codicem_inconito = False
with open(objetivum, 'w') as _objetivum:
with open(fonti, 'r') as _fons:
_csv_reader = csv.reader(_fons)
Expand All @@ -792,12 +879,21 @@ def de_hxl_ad_hxltm(self, fonti: str, objetivum: str):
for linea in _csv_reader:
if not started:
started = True
_csv_writer.writerow(self._hxltmize(linea))
caput = self._hxltmize(linea)
if '#item+conceptum+codicem' not in caput:
codicem_inconito = True
caput.insert(0, '#item+conceptum+codicem')
_csv_writer.writerow(caput)
continue
if codicem_inconito is True:
index_linea += 1
linea.insert(0, str(index_linea))
_csv_writer.writerow(linea)

def de_hxltm_ad_no1(self, fonti: str, objetivum: str):
# print("TODO de_csv_ad_csvnorm")
numerordinatio_inconito = False
codicem_index = -1
with open(objetivum, 'w') as _objetivum:
with open(fonti, 'r') as _fons:
_csv_reader = csv.reader(_fons)
Expand All @@ -806,8 +902,17 @@ def de_hxltm_ad_no1(self, fonti: str, objetivum: str):
for linea in _csv_reader:
if not started:
started = True
_csv_writer.writerow(self._hxltmize(linea))
caput = self._no1lize(linea)
if '#item+conceptum+numerordinatio' not in caput:
numerordinatio_inconito = True
codicem_index = caput.index(
'#item+conceptum+codicem')
caput.insert(0, '#item+conceptum+numerordinatio')
_csv_writer.writerow(caput)
continue
if numerordinatio_inconito is True:
linea.insert(0, '{0}:{1}'.format(
self.numerordinatio_praefixo, linea[codicem_index]))
_csv_writer.writerow(linea)


Expand Down Expand Up @@ -1040,6 +1145,7 @@ def praeparatio(self):
class DataScrappingWorldbank(DataScrapping):

methodus: str = 'SP.POP.TOTL'
numerordinatio_praefixo: str = '999999:0'
objectivum_formato: str = 'csv'
# link_fonti: str = 'https://api.worldbank.org/v2/en/indicator/SP.POP.TOTL?downloadformat=excel'
link_fonti: str = 'https://api.worldbank.org/v2/en/indicator/SP.POP.TOTL?downloadformat=csv'
Expand Down
16 changes: 13 additions & 3 deletions officina/999999999/999999_17.sh
Original file line number Diff line number Diff line change
Expand Up @@ -78,15 +78,25 @@ set -x
--methodus-fonti=worldbank \
--methodus="SP.POP.TOTL" \
--objectivum-formato=no1 \
--numerordinatio-praefixo="1603_992_1_0" \
>"999999/0/1603_992_1_0~worldbank~SP_POP_TOTL.no1.tm.hxl.csv"


./999999999/0/999999999_54872.py \
--methodus=_temp_no1 \
--rdf-sine-spatia-nominalibus=devnull \
--rdf-sine-spatia-nominalibus=devnull,mdciii \
--rdf-trivio=1603 \
"999999/0/1603_992_1_0~worldbank~SP_POP_TOTL.no1.tm.hxl.csv" \
>"999999/0/1603_992_1_0~worldbank~SP_POP_TOTL.no1.owl.ttl"
>"999999/0/1603_992_1_0~worldbank~SP_POP_TOTL~TEMP.no1.owl.ttl"


# @TODO fix the extra namespace when we use default 1603
rdfpipe --input-format=turtle --output-format=longturtle \
"999999/0/1603_992_1_0~worldbank~SP_POP_TOTL~TEMP.no1.owl.ttl" \
"999999/0/1603_992_1_0~worldbank~SP_POP_TOTL.no1.owl.ttl"

# rdfpipe --input-format=turtle --output-format=longturtle 999999/0/1603_992_1_0~worldbank~SP_POP_TOTL~TEMP.no1.owl.ttl

# ./999999999/0/999999999_521850.py --methodus-fonti=worldbank --methodus=SP.POP.TOTL --objectivum-formato=no1 --rdf-trivio=12345 | head

./999999999/0/999999999_521850.py \
--methodus-fonti=worldbank \
Expand Down

0 comments on commit f5f1ecc

Please sign in to comment.