From 50135f2d356247a8d55e4e56af130c25f00f8fc4 Mon Sep 17 00:00:00 2001 From: Emerson Rocha Date: Sat, 20 Nov 2021 04:28:35 -0300 Subject: [PATCH] #1, #4, #5: scripts/fn_tico19_datainfo_tmx.py started --- .github/workflows/tico-19-hxltm_etica-ai.yml | 2 +- .gitignore | 1 + scripts/_run-all-data-scripts.sh | 21 ------ scripts/_setup-local-machine.sh | 56 +++++++++++++++ scripts/data-info/tico19_tm.csv | 38 ++++++++++ ...ata-original-prepare-translation-memory.sh | 43 +++++++++++ scripts/fn_tico19_datainfo_tmx.py | 71 +++++++++++++++++++ 7 files changed, 210 insertions(+), 22 deletions(-) create mode 100755 scripts/_setup-local-machine.sh create mode 100644 scripts/data-info/tico19_tm.csv create mode 100755 scripts/fn_tico19_datainfo_tmx.py diff --git a/.github/workflows/tico-19-hxltm_etica-ai.yml b/.github/workflows/tico-19-hxltm_etica-ai.yml index 8d55822..db08457 100644 --- a/.github/workflows/tico-19-hxltm_etica-ai.yml +++ b/.github/workflows/tico-19-hxltm_etica-ai.yml @@ -175,7 +175,7 @@ jobs: # bundle exec asciidoctor-pdf --attribute allow-uri-read=1 docs/eng-Latn/index.adoc # bundle exec asciidoctor-epub --attribute allow-uri-read=1 docs/eng-Latn/index.adoc - # bundle exec asciidoctor-pdf -v --attribute allow-uri-read=1 --attribute source-highlighter=rouge docs/eng-Latn/index.adoc -out-file docs/tico-19-hxltm_eng-Latn.pdf + # bundle exec asciidoctor-pdf -v --attribute allow-uri-read=1 --attribute source-highlighter=rouge docs/eng-Latn/index.adoc --out-file docs/tico-19-hxltm_eng-Latn.pdf # bundle exec asciidoctor-epub3 -v --attribute allow-uri-read=1 --attribute source-highlighter=rouge docs/eng-Latn/index.adoc # bundle exec asciidoctor-pdf -v --attribute allow-uri-read=1 docs/eng-Latn/draft-eng-Latn.adoc diff --git a/.gitignore b/.gitignore index 8ae8f99..d3149f8 100644 --- a/.gitignore +++ b/.gitignore @@ -27,6 +27,7 @@ data/tico-19-terminology-facebook.tm2.hxl.csv # temporary data/original/translation-memory/translators-without-borders/all/*.tmx 
data/original/translation-memory/translators-without-borders/all/*.zip +data/original/translation-memory/translators-without-borders/all_bi_en_tmx ### ascidoctor _________________________________________________________________ # Dont' commit books on the working branch, just the website diff --git a/scripts/_run-all-data-scripts.sh b/scripts/_run-all-data-scripts.sh index 3c70ac5..422eab8 100755 --- a/scripts/_run-all-data-scripts.sh +++ b/scripts/_run-all-data-scripts.sh @@ -37,27 +37,6 @@ # ============================================================================== set -e -# TODO: move this to some file related to deploy website -VAR_Gemfile=$(cat << EOF -source 'https://rubygems.org' -gem 'asciidoctor' -gem 'asciidoctor-pdf' -gem 'asciidoctor-epub3' -# https://github.com/asciidoctor/asciidoctor-bibtex -gem 'asciidoctor-bibtex' -# https://github.com/asciidoctor/asciidoctor-chart -gem 'asciidoctor-chart' -## https://github.com/asciidoctor/asciidoctor-latex -# gem 'asciidoctor-latex' -gem 'rouge' -# https://github.com/gjtorikian/html-proofer -gem 'html-proofer' -EOF -) -echo "$VAR_Gemfile" > Gemfile -# bundle install -# bundle exec asciidoctor-pdf -v --attribute allow-uri-read=1 --attribute source-highlighter=rouge docs/eng-Latn/index.adoc --out-file docs/tico-19-hxltm_eng-Latn.pdf - ./scripts/data-original-download.sh ./scripts/data-external-prepare.sh diff --git a/scripts/_setup-local-machine.sh b/scripts/_setup-local-machine.sh new file mode 100755 index 0000000..5920182 --- /dev/null +++ b/scripts/_setup-local-machine.sh @@ -0,0 +1,56 @@ +#!/bin/sh +# ============================================================================== +# +# FILE: _setup-local-machine.sh +# +# USAGE: ./scripts/_setup-local-machine.sh +# +# DESCRIPTION: Script NOT related with data generation. 
+# This is only relevant if you are trying to run scripts on +# your machine (like trying to generate the preview of the +# website) +# +# OPTIONS: --- +# +# REQUIREMENTS: - POSIX Shell or better +# - ruby +# BUGS: --- +# NOTES: --- +# AUTHORS: Emerson Rocha +# COLLABORATORS: <@TODO: put additional non-anonymous names here> +# COMPANY: EticaAI +# LICENSE: Public Domain dedication OR Zero-Clause BSD +# SPDX-License-Identifier: Unlicense OR 0BSD +# VERSION: v1.0 +# CREATED: 2021-11-18 04:43 UTC started +# ============================================================================== +set -e + +# TODO: move this to some file related to deploy website +if [ ! -f 'Gemfile' ]; then + VAR_Gemfile=$(cat << EOF +source 'https://rubygems.org' +gem 'asciidoctor' +gem 'asciidoctor-pdf' +gem 'asciidoctor-epub3' +# https://github.com/asciidoctor/asciidoctor-bibtex +gem 'asciidoctor-bibtex' +# https://github.com/asciidoctor/asciidoctor-chart +gem 'asciidoctor-chart' +## https://github.com/asciidoctor/asciidoctor-latex +# gem 'asciidoctor-latex' +gem 'rouge' +# https://github.com/gjtorikian/html-proofer +gem 'html-proofer' +EOF +) + echo "$VAR_Gemfile" > Gemfile + bundle install +fi + +set -x +bundle exec asciidoctor-pdf -v --attribute allow-uri-read=1 --attribute source-highlighter=rouge docs/eng-Latn/index.adoc --out-file docs/tico-19-hxltm_eng-Latn.pdf +bundle exec asciidoctor-epub3 -v --attribute allow-uri-read=1 --attribute source-highlighter=rouge docs/eng-Latn/index.adoc --out-file docs/tico-19-hxltm_eng-Latn.epub + +set +x +echo 'Okay' \ No newline at end of file diff --git a/scripts/data-info/tico19_tm.csv b/scripts/data-info/tico19_tm.csv new file mode 100644 index 0000000..c839a95 --- /dev/null +++ b/scripts/data-info/tico19_tm.csv @@ -0,0 +1,38 @@ +tmx_filename_original_lang,source_lang_original,source_lang_bcp47,target_lang_original,target_lang_bcp47 +en-ar,en,en,ar,ar +en-bn,en,en,bn,bn +en-ckb,en,en,ckb,ckb +en-din,en,en,din,din +en-es-LA,en,en,es-LA,es-LA 
+en-fa,en,en,fa,fa +en-fr,en,en,fr,fr +en-fuv,en,en,fuv,fuv +en-ha,en,en,ha,ha +en-hi,en,en,hi,hi +en-id,en,en,id,id +en-km,en,en,km,km +en-kr,en,en,kr,kr +en-ku,en,en,ku,ku +en-lg,en,en,lg,lg +en-ln,en,en,ln,ln +en-mr,en,en,mr,mr +en-ms,en,en,ms,ms +en-my,en,en,my,my +en-ne,en,en,ne,ne +en-nus,en,en,nus,nus +en-om,en,en,om,om +en-prs,en,en,prs,prs +en-ps,en,en,ps,ps +en-pt-BR,en,en,pt-BR,pt-BR +en-ru,en,en,ru,ru +en-rw,en,en,rw,rw +en-so,en,en,so,so +en-sw,en,en,sw,sw +en-ta,en,en,ta,ta +en-ti,en,en,ti,ti +en-ti_ER,en,en,ti_ER,ti-ER +en-ti_ET,en,en,ti_ET,ti-ET +en-tl,en,en,tl,tl +en-ur,en,en,ur,ur +en-zh,en,en,zh,zh +en-zu,en,en,zu,zu diff --git a/scripts/data-original-prepare-translation-memory.sh b/scripts/data-original-prepare-translation-memory.sh index 2cafb22..d690724 100755 --- a/scripts/data-original-prepare-translation-memory.sh +++ b/scripts/data-original-prepare-translation-memory.sh @@ -153,6 +153,49 @@ tico19_tmx_extract "en-ur" tico19_tmx_extract "en-zh" tico19_tmx_extract "en-zu" + +./scripts/fn_tico19_datainfo_tmx.py "csv-header" > scripts/data-info/tico19_tm.csv + +# find data/original/TM/ -iname all.en-*.zip | grep -E '(en-...?.?.?.?).tmx' --only-matching | sed 's/.tmx//' | grep -v old | sort | xargs printf '\n./scripts/fn_tico19_datainfo_tmx.py "%s" >> scripts/data-info/tico19_tm.csv' + +./scripts/fn_tico19_datainfo_tmx.py "en-ar" >> scripts/data-info/tico19_tm.csv +./scripts/fn_tico19_datainfo_tmx.py "en-bn" >> scripts/data-info/tico19_tm.csv +./scripts/fn_tico19_datainfo_tmx.py "en-ckb" >> scripts/data-info/tico19_tm.csv +./scripts/fn_tico19_datainfo_tmx.py "en-din" >> scripts/data-info/tico19_tm.csv +./scripts/fn_tico19_datainfo_tmx.py "en-es-LA" >> scripts/data-info/tico19_tm.csv +./scripts/fn_tico19_datainfo_tmx.py "en-fa" >> scripts/data-info/tico19_tm.csv +./scripts/fn_tico19_datainfo_tmx.py "en-fr" >> scripts/data-info/tico19_tm.csv +./scripts/fn_tico19_datainfo_tmx.py "en-fuv" >> scripts/data-info/tico19_tm.csv 
+./scripts/fn_tico19_datainfo_tmx.py "en-ha" >> scripts/data-info/tico19_tm.csv +./scripts/fn_tico19_datainfo_tmx.py "en-hi" >> scripts/data-info/tico19_tm.csv +./scripts/fn_tico19_datainfo_tmx.py "en-id" >> scripts/data-info/tico19_tm.csv +./scripts/fn_tico19_datainfo_tmx.py "en-km" >> scripts/data-info/tico19_tm.csv +./scripts/fn_tico19_datainfo_tmx.py "en-kr" >> scripts/data-info/tico19_tm.csv +./scripts/fn_tico19_datainfo_tmx.py "en-ku" >> scripts/data-info/tico19_tm.csv +./scripts/fn_tico19_datainfo_tmx.py "en-lg" >> scripts/data-info/tico19_tm.csv +./scripts/fn_tico19_datainfo_tmx.py "en-ln" >> scripts/data-info/tico19_tm.csv +./scripts/fn_tico19_datainfo_tmx.py "en-mr" >> scripts/data-info/tico19_tm.csv +./scripts/fn_tico19_datainfo_tmx.py "en-ms" >> scripts/data-info/tico19_tm.csv +./scripts/fn_tico19_datainfo_tmx.py "en-my" >> scripts/data-info/tico19_tm.csv +./scripts/fn_tico19_datainfo_tmx.py "en-ne" >> scripts/data-info/tico19_tm.csv +./scripts/fn_tico19_datainfo_tmx.py "en-nus" >> scripts/data-info/tico19_tm.csv +./scripts/fn_tico19_datainfo_tmx.py "en-om" >> scripts/data-info/tico19_tm.csv +./scripts/fn_tico19_datainfo_tmx.py "en-prs" >> scripts/data-info/tico19_tm.csv +./scripts/fn_tico19_datainfo_tmx.py "en-ps" >> scripts/data-info/tico19_tm.csv +./scripts/fn_tico19_datainfo_tmx.py "en-pt-BR" >> scripts/data-info/tico19_tm.csv +./scripts/fn_tico19_datainfo_tmx.py "en-ru" >> scripts/data-info/tico19_tm.csv +./scripts/fn_tico19_datainfo_tmx.py "en-rw" >> scripts/data-info/tico19_tm.csv +./scripts/fn_tico19_datainfo_tmx.py "en-so" >> scripts/data-info/tico19_tm.csv +./scripts/fn_tico19_datainfo_tmx.py "en-sw" >> scripts/data-info/tico19_tm.csv +./scripts/fn_tico19_datainfo_tmx.py "en-ta" >> scripts/data-info/tico19_tm.csv +./scripts/fn_tico19_datainfo_tmx.py "en-ti" >> scripts/data-info/tico19_tm.csv +./scripts/fn_tico19_datainfo_tmx.py "en-ti_ER" >> scripts/data-info/tico19_tm.csv +./scripts/fn_tico19_datainfo_tmx.py "en-ti_ET" >> 
scripts/data-info/tico19_tm.csv +./scripts/fn_tico19_datainfo_tmx.py "en-tl" >> scripts/data-info/tico19_tm.csv +./scripts/fn_tico19_datainfo_tmx.py "en-ur" >> scripts/data-info/tico19_tm.csv +./scripts/fn_tico19_datainfo_tmx.py "en-zh" >> scripts/data-info/tico19_tm.csv +./scripts/fn_tico19_datainfo_tmx.py "en-zu" >> scripts/data-info/tico19_tm.csv + #### scripts/data-info/tico19_tm_twb_initial-language-pairs_source-lang-en.csv ________________ # Save the languages to CSV file to reuse later find data/original/TM/ -iname "all.en-*.zip" \ diff --git a/scripts/fn_tico19_datainfo_tmx.py b/scripts/fn_tico19_datainfo_tmx.py new file mode 100755 index 0000000..4f7e23d --- /dev/null +++ b/scripts/fn_tico19_datainfo_tmx.py @@ -0,0 +1,71 @@ +#!/usr/bin/python3 +# ============================================================================== +# +# FILE: scripts/fn_tico19_datainfo_tmx.py +# +# USAGE: ./scripts/fn_tico19_datainfo_tmx.py +# +# DESCRIPTION: Hardcoded function to generate data about what should be +# converted, based on whatever the file naming was +# in the original. 
#
#       OPTIONS:  ---
#
#  REQUIREMENTS:  - python3
#          BUGS:  ---
#         NOTES:  ---
#       AUTHORS:  Emerson Rocha
# COLLABORATORS:  <@TODO: put additional non-anonymous names here>
#       COMPANY:  EticaAI
#       LICENSE:  Public Domain dedication OR Zero-Clause BSD
#                 SPDX-License-Identifier: Unlicense OR 0BSD
#       VERSION:  v1.0
#       CREATED:  2021-11-20 03:26 UTC
# ==============================================================================

import sys

# Every TMX file handled here is named "en-<target>"; English is the only
# source language this quick helper knows about.
SOURCE_LANG_ORIGINAL = 'en'
SOURCE_LANG_BCP47 = 'en'
SOURCE_LANG_PREFIX = 'en-'

# Column names of the generated CSV, in output order.
CSV_HEADER_FIELDS = (
    'tmx_filename_original_lang',
    'source_lang_original',
    'source_lang_bcp47',
    'target_lang_original',
    'target_lang_bcp47',
)


def _usage(program):
    """Return the help text for *program* (the script path as invoked)."""
    return '\n'.join([
        'usage: ' + program + ' xx-yy',
        'example: ',
        ' ' + program + ' csv-header',
        ' ' + program + ' en-pt-BR',
        ' ' + program + ' en-ti_ER',
    ])


def _csv_header():
    """Return the CSV header line (without trailing newline)."""
    return ','.join(CSV_HEADER_FIELDS)


def _language_pair_row(pair):
    """Return the CSV row describing the TMX language pair *pair*.

    *pair* is the language part of the original file name, for example
    ``en-pt-BR`` or ``en-ti_ER``. The target language is everything after
    the leading ``en-``; its BCP 47 form is obtained by replacing ``_``
    with ``-``.

    Raises:
        NotImplementedError: if *pair* does not start with ``en-``.
    """
    # The original file naming is too inconsistent to justify supporting
    # every possible pattern in a helper whose only job is quick metadata.
    # The full dataset is generated later, so anything else is rejected.
    if not pair.startswith(SOURCE_LANG_PREFIX):
        raise NotImplementedError('Not implemented')

    # Strip only the leading prefix: a target that itself contains "en-"
    # must not be altered.
    target_original = pair[len(SOURCE_LANG_PREFIX):]
    target_bcp47 = target_original.replace('_', '-')

    return ','.join([
        pair,
        SOURCE_LANG_ORIGINAL,
        SOURCE_LANG_BCP47,
        target_original,
        target_bcp47,
    ])


def main(argv=None):
    """Command-line entry point; print one line of CSV (or help) to stdout.

    Args:
        argv: full argument vector including the program name; defaults to
            ``sys.argv``.

    Returns:
        The process exit status (always 0; unsupported input raises).
    """
    argv = sys.argv if argv is None else argv

    if len(argv) < 2 or argv[1] in ('-h', '--help'):
        print(_usage(argv[0]))
        return 0

    if argv[1] == 'csv-header':
        print(_csv_header())
        return 0

    print(_language_pair_row(argv[1]))
    return 0


if __name__ == '__main__':
    sys.exit(main())