From 8dc806d6a29c16077d73ca77f4641b52d187b909 Mon Sep 17 00:00:00 2001
From: Emerson Rocha
Date: Sun, 23 Jan 2022 14:46:20 -0300
Subject: [PATCH] 1603:1:51 (#9): HXLTM export on "

---
 officinam/999999999/0/1603_3_12.py      | 75 +++++++++++++++++++++++--
 officinam/999999999/1603_3_1603_45_1.sh |  1 +
 officinam/999999999/999999999.lib.sh    |  9 +--
 3 files changed, 77 insertions(+), 8 deletions(-)

diff --git a/officinam/999999999/0/1603_3_12.py b/officinam/999999999/0/1603_3_12.py
index f4a17bf..ff5fda6 100755
--- a/officinam/999999999/0/1603_3_12.py
+++ b/officinam/999999999/0/1603_3_12.py
@@ -35,6 +35,7 @@
 # printf "Q1065\nQ82151\n" | ./999999999/0/1603_3_12.py --actionem-sparql --query | ./999999999/0/1603_3_12.py --actionem-sparql --wikidata-link
 # printf "Q1065\nQ82151\n" | ./999999999/0/1603_3_12.py --actionem-sparql --query | ./999999999/0/1603_3_12.py --actionem-sparql --tsv > 999999/0/test.tsv
 # printf "Q1065\nQ82151\n" | ./999999999/0/1603_3_12.py --actionem-sparql --query | ./999999999/0/1603_3_12.py --actionem-sparql --csv > 999999/0/test.csv
+# printf "Q1065\nQ82151\n" | ./999999999/0/1603_3_12.py --actionem-sparql --query | ./999999999/0/1603_3_12.py --actionem-sparql --csv --hxltm
 
 # TODO: https://sinaahmadi.github.io/posts/10-essential-sparql-queries-for-lexicographical-data-on-wikidata.html
 
@@ -92,7 +93,32 @@
 #   SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
 # }
 
+
+def hxltm_hastag_de_csvhxlated(csv_caput: list) -> list:
+    """Convert CSV-style header names into HXL hashtags
+
+    Make this type of conversion:
+    - 'item__conceptum__codicem' => '#item+conceptum+codicem'
+    - 'item__rem__i_ara__is_arab' => '#item+rem+i_ara+is_arab'
+    - '' => ''
+
+    Args:
+        csv_caput (list): CSV header fields (column names) to convert
+
+    Returns:
+        list: the same fields converted to HXL hashtags (empty items kept)
+    """
+    resultatum = []
+    for item in csv_caput:
+        if len(item):
+            resultatum.append('#' + item.replace('__', '+').replace('?', ''))
+        else:
+            resultatum.append('')
+    return resultatum
+
 # https://stackoverflow.com/questions/43258341/how-to-get-wikidata-labels-in-more-than-one-language
+
+
 class CS1603z3z12:
     """
     [summary]
@@ -194,12 +220,11 @@ def est_wikidata_q(self, wikidata_codicem: str):
     #     }
     # }
 
-
     def query(self):
         qid = ['wd:' + x for x in self.qid if isinstance(x, str)]
 
         # select = '?item ' + " ".join(self._query_linguam())
-        select = ['?item']
+        select = ['(?item AS ?item__conceptum__codicem)']
         filter_otional = []
         for pair in self.D1613_1_51_langpair:
             select.append('?' + pair[1])
@@ -325,6 +350,7 @@ def make_args(self, hxl_output=True):
             const=True,
             nargs='?'
         )
+
         neo_codex.add_argument(
             '--tsv',
             help='Generate TSV output (from piped in query)',
@@ -334,6 +360,17 @@ def make_args(self, hxl_output=True):
             nargs='?'
         )
 
+        neo_codex.add_argument(
+            '--hxltm',
+            help='Generate HXL-tagged output (from piped in query). ' +
+            'Concepts use #item+conceptum+codicem instead ' +
+            'of #item+code+v_wiki_q',
+            metavar='',
+            dest='hxltm',
+            const=True,
+            nargs='?'
+        )
+
         # neo_codex.add_argument(
         #     '--actionem-verbum-simplex',
        #     help='Do not generate the codes. Just calculate the full matrix ' +
@@ -624,16 +661,46 @@ def execute_cli(self, pyargs, stdin=STDIN, stdout=sys.stdout,
 
             # https://www.mediawiki.org/wiki/Wikidata_Query_Service/User_Manual/en#Supported_formats
             if self.pyargs.tsv:
+                separator = "\t"
                 headers = {'Accept': 'text/tab-separated-values'}
             if self.pyargs.csv:
+                separator = ","
+                headers = {'Accept': 'text/csv'}
+            if self.pyargs.hxltm:
+                # headers = {'Accept': 'text/tab-separated-values'}
                 headers = {'Accept': 'text/csv'}
 
             payload_query = "".join(full_query)
 
-            r = requests.get(sparql_backend, headers=headers, params={
+            r = requests.post(sparql_backend, headers=headers, data={
                 'query': payload_query
             })
-            print(r.text.strip())
+            # @TODO: the combination --tsv --hxltm is known to be buggy (not
+            #        sure if the Wikidata result already skips values)
+
+            if self.pyargs.hxltm:
+                result_string = r.text.strip()
+
+                # @TODO: this is likely to break on fields that contain
+                #        newlines; however, no testing sample exists at the
+                #        moment. Eventually this needs to be checked.
+                lines = result_string.splitlines()
+                # caput = hxltm_hastag_de_csvhxlated(next(iter(lines)).split(","))
+                caput_crudum = lines.pop(0)
+                # print('caput_crudum', caput_crudum)
+                caput = hxltm_hastag_de_csvhxlated(caput_crudum.split(','))
+                print(separator.join(caput))
+                print("\n".join(lines))
+
+                # reader = csv.reader(lines, delimiter="\t")
+                # caput = hxltm_hastag_de_csvhxlated(next(reader))
+                # print(separator.join(caput))
+                # for row in reader:
+                #     print(separator.join(row))
+            else:
+                print(r.text.strip())
+
+            # TODO: generate explicit error messages and return code
 
             # print(r.content)
             return self.EXIT_OK
diff --git a/officinam/999999999/1603_3_1603_45_1.sh b/officinam/999999999/1603_3_1603_45_1.sh
index f0277dc..38d50ac 100755
--- a/officinam/999999999/1603_3_1603_45_1.sh
+++ b/officinam/999999999/1603_3_1603_45_1.sh
@@ -5,6 +5,7 @@
 #
 # USAGE:  ./999999999/1603_3_1603_45_1.sh
 #         time ./999999999/1603_3_1603_45_1.sh
+#         time FORCE_CHANGED=1 ./999999999/1603_3_1603_45_1.sh
 #
 # DESCRIPTION: ---
 #
diff --git a/officinam/999999999/999999999.lib.sh b/officinam/999999999/999999999.lib.sh
index 46d537e..7822465 100644
--- a/officinam/999999999/999999999.lib.sh
+++ b/officinam/999999999/999999999.lib.sh
@@ -127,6 +127,7 @@ file_update_if_necessary() {
   csv)
     is_valid=$(csvclean --dry-run "$fontem_archivum")
     if [ "$is_valid" != "No errors." ]; then
+      echo "$fontem_archivum"
      echo "$is_valid"
      return 1
    fi
@@ -393,17 +394,17 @@ file_translate_csv_de_numerordinatio_q() {
   fi
 
   fontem_archivum="${_basim_fontem}/$_path/$_nomen.no1.tm.hxl.csv"
-  objectivum_archivum="${_basim_objectivum}/$_path/$_nomen.wikiq.tm.csv"
+  objectivum_archivum="${_basim_objectivum}/$_path/$_nomen.wikiq.tm.hxl.csv"
   objectivum_archivum_temporarium="${ROOTDIR}/999999/0/$_nomen.no1.tm.hxl.csv"
   objectivum_archivum_temporarium_b="${ROOTDIR}/999999/0/$_nomen.q.txt"
   objectivum_archivum_temporarium_b_u="${ROOTDIR}/999999/0/$_nomen.uniq.q.txt"
-  objectivum_archivum_temporarium_b_u_wiki="${ROOTDIR}/999999/0/$_nomen.wikiq.tm.csv"
+  objectivum_archivum_temporarium_b_u_wiki="${ROOTDIR}/999999/0/$_nomen.wikiq.tm.hxl.csv"
 
   # if [ -z "$(changed_recently "$fontem_archivum")" ]; then return 0; fi
   # echo "${FUNCNAME[0]} sources changed_recently. Reloading..."
 
-  if [ -z "$(stale_archive "$objectivum_archivum")" ]; then return 0; fi
+  # if [ -z "$(stale_archive "$objectivum_archivum")" ]; then return 0; fi
   echo "${FUNCNAME[0]} stale data on [$objectivum_archivum], refreshing..."
@@ -445,7 +446,7 @@ file_translate_csv_de_numerordinatio_q() {
   sort --version-sort --field-separator="Q" <"$objectivum_archivum_temporarium_b" | uniq >"$objectivum_archivum_temporarium_b_u"
 
   "${ROOTDIR}/999999999/0/1603_3_12.py" --actionem-sparql --query <"$objectivum_archivum_temporarium_b_u" |
-    ./999999999/0/1603_3_12.py --actionem-sparql --csv \
+    ./999999999/0/1603_3_12.py --actionem-sparql --csv --hxltm \
     >"$objectivum_archivum_temporarium_b_u_wiki"
 
   # "$objectivum_archivum_temporarium_b_u"
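
For reference, a minimal standalone Python sketch (not part of the patch itself) of the header
conversion that the new --hxltm flag relies on, mirroring the hxltm_hastag_de_csvhxlated()
helper added to 1603_3_12.py above; the sample header values below are only illustrative:

    # Sketch: rewrite a CSV header row into HXL hashtags, as --hxltm does
    # before printing the Wikidata SPARQL result.
    def hxltm_hastag_de_csvhxlated(csv_caput: list) -> list:
        resultatum = []
        for item in csv_caput:
            if len(item):
                # 'item__conceptum__codicem' -> '#item+conceptum+codicem';
                # any '?' still present in the raw header is stripped as well
                resultatum.append('#' + item.replace('__', '+').replace('?', ''))
            else:
                resultatum.append('')
        return resultatum

    caput = ['item__conceptum__codicem', 'item__rem__i_ara__is_arab', '']
    print(hxltm_hastag_de_csvhxlated(caput))
    # ['#item+conceptum+codicem', '#item+rem+i_ara+is_arab', '']

Empty header cells are returned as empty strings, so the rewritten HXL header row stays
column-aligned with the data rows that follow it.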