From 50135f2d356247a8d55e4e56af130c25f00f8fc4 Mon Sep 17 00:00:00 2001
From: Emerson Rocha <rocha@ieee.org>
Date: Sat, 20 Nov 2021 04:28:35 -0300
Subject: [PATCH] #1, #4, #5: scripts/fn_tico19_datainfo_tmx.py started

---
 .github/workflows/tico-19-hxltm_etica-ai.yml  |  2 +-
 .gitignore                                    |  1 +
 scripts/_run-all-data-scripts.sh              | 21 ------
 scripts/_setup-local-machine.sh               | 56 +++++++++++++++
 scripts/data-info/tico19_tm.csv               | 38 ++++++++++
 ...ata-original-prepare-translation-memory.sh | 43 +++++++++++
 scripts/fn_tico19_datainfo_tmx.py             | 71 +++++++++++++++++++
 7 files changed, 210 insertions(+), 22 deletions(-)
 create mode 100755 scripts/_setup-local-machine.sh
 create mode 100644 scripts/data-info/tico19_tm.csv
 create mode 100755 scripts/fn_tico19_datainfo_tmx.py

diff --git a/.github/workflows/tico-19-hxltm_etica-ai.yml b/.github/workflows/tico-19-hxltm_etica-ai.yml
index 8d55822..db08457 100644
--- a/.github/workflows/tico-19-hxltm_etica-ai.yml
+++ b/.github/workflows/tico-19-hxltm_etica-ai.yml
@@ -175,7 +175,7 @@ jobs:
 
       # bundle exec asciidoctor-pdf --attribute allow-uri-read=1 docs/eng-Latn/index.adoc
       # bundle exec asciidoctor-epub --attribute allow-uri-read=1 docs/eng-Latn/index.adoc
-      # bundle exec asciidoctor-pdf -v --attribute allow-uri-read=1 --attribute source-highlighter=rouge docs/eng-Latn/index.adoc -out-file docs/tico-19-hxltm_eng-Latn.pdf
+      # bundle exec asciidoctor-pdf -v --attribute allow-uri-read=1 --attribute source-highlighter=rouge docs/eng-Latn/index.adoc --out-file docs/tico-19-hxltm_eng-Latn.pdf
       # bundle exec asciidoctor-epub3 -v --attribute allow-uri-read=1 --attribute source-highlighter=rouge docs/eng-Latn/index.adoc
 
       # bundle exec asciidoctor-pdf -v --attribute allow-uri-read=1 docs/eng-Latn/draft-eng-Latn.adoc
diff --git a/.gitignore b/.gitignore
index 8ae8f99..d3149f8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,6 +27,7 @@ data/tico-19-terminology-facebook.tm2.hxl.csv
 # temporary
 data/original/translation-memory/translators-without-borders/all/*.tmx
 data/original/translation-memory/translators-without-borders/all/*.zip
+data/original/translation-memory/translators-without-borders/all_bi_en_tmx
 
 ### ascidoctor _________________________________________________________________
 # Dont' commit books on the working branch, just the website
diff --git a/scripts/_run-all-data-scripts.sh b/scripts/_run-all-data-scripts.sh
index 3c70ac5..422eab8 100755
--- a/scripts/_run-all-data-scripts.sh
+++ b/scripts/_run-all-data-scripts.sh
@@ -37,27 +37,6 @@
 # ==============================================================================
 set -e
 
-# TODO: move this to some file related to deploy website
-VAR_Gemfile=$(cat << EOF
-source 'https://rubygems.org'
-gem 'asciidoctor'
-gem 'asciidoctor-pdf'
-gem 'asciidoctor-epub3'
-# https://github.com/asciidoctor/asciidoctor-bibtex
-gem 'asciidoctor-bibtex'
-# https://github.com/asciidoctor/asciidoctor-chart
-gem 'asciidoctor-chart'
-## https://github.com/asciidoctor/asciidoctor-latex
-# gem 'asciidoctor-latex'
-gem 'rouge'
-# https://github.com/gjtorikian/html-proofer
-gem 'html-proofer'
-EOF
-)
-echo "$VAR_Gemfile" > Gemfile
-# bundle install
-# bundle exec asciidoctor-pdf -v --attribute allow-uri-read=1 --attribute source-highlighter=rouge docs/eng-Latn/index.adoc --out-file docs/tico-19-hxltm_eng-Latn.pdf
-
 ./scripts/data-original-download.sh
 
 ./scripts/data-external-prepare.sh
diff --git a/scripts/_setup-local-machine.sh b/scripts/_setup-local-machine.sh
new file mode 100755
index 0000000..5920182
--- /dev/null
+++ b/scripts/_setup-local-machine.sh
@@ -0,0 +1,56 @@
+#!/bin/sh
+# ==============================================================================
+#
+#          FILE:  _setup-local-machine.sh
+#
+#         USAGE:  ./scripts/_setup-local-machine.sh
+#
+#   DESCRIPTION: Script NOT related to data generation.
+#                This is only relevant if you are trying to run scripts on
+#                your machine (like trying to generate the preview of the
+#                website)
+#
+#       OPTIONS:  ---
+#
+#  REQUIREMENTS:  - POSIX Shell or better
+#                 - ruby
+#          BUGS:  ---
+#         NOTES:  ---
+#       AUTHORS:  Emerson Rocha <rocha[at]ieee.org>
+# COLLABORATORS:  <@TODO: put additional non-anonymous names here>
+#       COMPANY:  EticaAI
+#       LICENSE:  Public Domain dedication OR Zero-Clause BSD
+#                 SPDX-License-Identifier: Unlicense OR 0BSD
+#       VERSION:  v1.0
+#       CREATED:  2021-11-18 04:43 UTC started
+# ==============================================================================
+set -e
+
+# TODO: move this to some file related to deploy website
+if [ ! -f 'Gemfile' ]; then
+    VAR_Gemfile=$(cat << EOF
+source 'https://rubygems.org'
+gem 'asciidoctor'
+gem 'asciidoctor-pdf'
+gem 'asciidoctor-epub3'
+# https://github.com/asciidoctor/asciidoctor-bibtex
+gem 'asciidoctor-bibtex'
+# https://github.com/asciidoctor/asciidoctor-chart
+gem 'asciidoctor-chart'
+## https://github.com/asciidoctor/asciidoctor-latex
+# gem 'asciidoctor-latex'
+gem 'rouge'
+# https://github.com/gjtorikian/html-proofer
+gem 'html-proofer'
+EOF
+)
+    echo "$VAR_Gemfile" > Gemfile
+    bundle install
+fi
+
+set -x
+bundle exec asciidoctor-pdf -v --attribute allow-uri-read=1 --attribute source-highlighter=rouge docs/eng-Latn/index.adoc --out-file docs/tico-19-hxltm_eng-Latn.pdf
+bundle exec asciidoctor-epub3 -v --attribute allow-uri-read=1 --attribute source-highlighter=rouge docs/eng-Latn/index.adoc --out-file docs/tico-19-hxltm_eng-Latn.epub
+
+set +x
+echo 'Okay'
\ No newline at end of file
diff --git a/scripts/data-info/tico19_tm.csv b/scripts/data-info/tico19_tm.csv
new file mode 100644
index 0000000..c839a95
--- /dev/null
+++ b/scripts/data-info/tico19_tm.csv
@@ -0,0 +1,38 @@
+tmx_filename_original_lang,source_lang_original,source_lang_bcp47,target_lang_original,target_lang_bcp47
+en-ar,en,en,ar,ar
+en-bn,en,en,bn,bn
+en-ckb,en,en,ckb,ckb
+en-din,en,en,din,din
+en-es-LA,en,en,es-LA,es-LA
+en-fa,en,en,fa,fa
+en-fr,en,en,fr,fr
+en-fuv,en,en,fuv,fuv
+en-ha,en,en,ha,ha
+en-hi,en,en,hi,hi
+en-id,en,en,id,id
+en-km,en,en,km,km
+en-kr,en,en,kr,kr
+en-ku,en,en,ku,ku
+en-lg,en,en,lg,lg
+en-ln,en,en,ln,ln
+en-mr,en,en,mr,mr
+en-ms,en,en,ms,ms
+en-my,en,en,my,my
+en-ne,en,en,ne,ne
+en-nus,en,en,nus,nus
+en-om,en,en,om,om
+en-prs,en,en,prs,prs
+en-ps,en,en,ps,ps
+en-pt-BR,en,en,pt-BR,pt-BR
+en-ru,en,en,ru,ru
+en-rw,en,en,rw,rw
+en-so,en,en,so,so
+en-sw,en,en,sw,sw
+en-ta,en,en,ta,ta
+en-ti,en,en,ti,ti
+en-ti_ER,en,en,ti_ER,ti-ER
+en-ti_ET,en,en,ti_ET,ti-ET
+en-tl,en,en,tl,tl
+en-ur,en,en,ur,ur
+en-zh,en,en,zh,zh
+en-zu,en,en,zu,zu
diff --git a/scripts/data-original-prepare-translation-memory.sh b/scripts/data-original-prepare-translation-memory.sh
index 2cafb22..d690724 100755
--- a/scripts/data-original-prepare-translation-memory.sh
+++ b/scripts/data-original-prepare-translation-memory.sh
@@ -153,6 +153,49 @@ tico19_tmx_extract "en-ur"
 tico19_tmx_extract "en-zh"
 tico19_tmx_extract "en-zu"
 
+
+./scripts/fn_tico19_datainfo_tmx.py "csv-header" > scripts/data-info/tico19_tm.csv
+
+# find data/original/TM/ -iname all.en-*.zip | grep -E '(en-...?.?.?.?).tmx' --only-matching | sed 's/.tmx//' | grep -v old | sort | xargs printf '\n./scripts/fn_tico19_datainfo_tmx.py "%s" >> scripts/data-info/tico19_tm.csv'
+
+./scripts/fn_tico19_datainfo_tmx.py "en-ar" >> scripts/data-info/tico19_tm.csv
+./scripts/fn_tico19_datainfo_tmx.py "en-bn" >> scripts/data-info/tico19_tm.csv
+./scripts/fn_tico19_datainfo_tmx.py "en-ckb" >> scripts/data-info/tico19_tm.csv
+./scripts/fn_tico19_datainfo_tmx.py "en-din" >> scripts/data-info/tico19_tm.csv
+./scripts/fn_tico19_datainfo_tmx.py "en-es-LA" >> scripts/data-info/tico19_tm.csv
+./scripts/fn_tico19_datainfo_tmx.py "en-fa" >> scripts/data-info/tico19_tm.csv
+./scripts/fn_tico19_datainfo_tmx.py "en-fr" >> scripts/data-info/tico19_tm.csv
+./scripts/fn_tico19_datainfo_tmx.py "en-fuv" >> scripts/data-info/tico19_tm.csv
+./scripts/fn_tico19_datainfo_tmx.py "en-ha" >> scripts/data-info/tico19_tm.csv
+./scripts/fn_tico19_datainfo_tmx.py "en-hi" >> scripts/data-info/tico19_tm.csv
+./scripts/fn_tico19_datainfo_tmx.py "en-id" >> scripts/data-info/tico19_tm.csv
+./scripts/fn_tico19_datainfo_tmx.py "en-km" >> scripts/data-info/tico19_tm.csv
+./scripts/fn_tico19_datainfo_tmx.py "en-kr" >> scripts/data-info/tico19_tm.csv
+./scripts/fn_tico19_datainfo_tmx.py "en-ku" >> scripts/data-info/tico19_tm.csv
+./scripts/fn_tico19_datainfo_tmx.py "en-lg" >> scripts/data-info/tico19_tm.csv
+./scripts/fn_tico19_datainfo_tmx.py "en-ln" >> scripts/data-info/tico19_tm.csv
+./scripts/fn_tico19_datainfo_tmx.py "en-mr" >> scripts/data-info/tico19_tm.csv
+./scripts/fn_tico19_datainfo_tmx.py "en-ms" >> scripts/data-info/tico19_tm.csv
+./scripts/fn_tico19_datainfo_tmx.py "en-my" >> scripts/data-info/tico19_tm.csv
+./scripts/fn_tico19_datainfo_tmx.py "en-ne" >> scripts/data-info/tico19_tm.csv
+./scripts/fn_tico19_datainfo_tmx.py "en-nus" >> scripts/data-info/tico19_tm.csv
+./scripts/fn_tico19_datainfo_tmx.py "en-om" >> scripts/data-info/tico19_tm.csv
+./scripts/fn_tico19_datainfo_tmx.py "en-prs" >> scripts/data-info/tico19_tm.csv
+./scripts/fn_tico19_datainfo_tmx.py "en-ps" >> scripts/data-info/tico19_tm.csv
+./scripts/fn_tico19_datainfo_tmx.py "en-pt-BR" >> scripts/data-info/tico19_tm.csv
+./scripts/fn_tico19_datainfo_tmx.py "en-ru" >> scripts/data-info/tico19_tm.csv
+./scripts/fn_tico19_datainfo_tmx.py "en-rw" >> scripts/data-info/tico19_tm.csv
+./scripts/fn_tico19_datainfo_tmx.py "en-so" >> scripts/data-info/tico19_tm.csv
+./scripts/fn_tico19_datainfo_tmx.py "en-sw" >> scripts/data-info/tico19_tm.csv
+./scripts/fn_tico19_datainfo_tmx.py "en-ta" >> scripts/data-info/tico19_tm.csv
+./scripts/fn_tico19_datainfo_tmx.py "en-ti" >> scripts/data-info/tico19_tm.csv
+./scripts/fn_tico19_datainfo_tmx.py "en-ti_ER" >> scripts/data-info/tico19_tm.csv
+./scripts/fn_tico19_datainfo_tmx.py "en-ti_ET" >> scripts/data-info/tico19_tm.csv
+./scripts/fn_tico19_datainfo_tmx.py "en-tl" >> scripts/data-info/tico19_tm.csv
+./scripts/fn_tico19_datainfo_tmx.py "en-ur" >> scripts/data-info/tico19_tm.csv
+./scripts/fn_tico19_datainfo_tmx.py "en-zh" >> scripts/data-info/tico19_tm.csv
+./scripts/fn_tico19_datainfo_tmx.py "en-zu" >> scripts/data-info/tico19_tm.csv
+
 #### scripts/data-info/tico19_tm_twb_initial-language-pairs_source-lang-en.csv ________________
 # Save the languages to CSV file to reuse later
 find data/original/TM/ -iname "all.en-*.zip" \
diff --git a/scripts/fn_tico19_datainfo_tmx.py b/scripts/fn_tico19_datainfo_tmx.py
new file mode 100755
index 0000000..4f7e23d
--- /dev/null
+++ b/scripts/fn_tico19_datainfo_tmx.py
@@ -0,0 +1,71 @@
+#!/usr/bin/python3
+# ==============================================================================
+#
+#          FILE:  scripts/fn_tico19_datainfo_tmx.py
+#
+#         USAGE:  ./scripts/fn_tico19_datainfo_tmx.py <csv-header|en-yy>
+#
+#   DESCRIPTION: Hardcoded function to generate data about what should be
+#                converted, based on whatever file naming was used
+#                in the original.
+#
+#       OPTIONS:  ---
+#
+#  REQUIREMENTS:  - python3
+#          BUGS:  ---
+#         NOTES:  ---
+#       AUTHORS:  Emerson Rocha <rocha[at]ieee.org>
+# COLLABORATORS:  <@TODO: put additional non-anonymous names here>
+#       COMPANY:  EticaAI
+#       LICENSE:  Public Domain dedication OR Zero-Clause BSD
+#                 SPDX-License-Identifier: Unlicense OR 0BSD
+#       VERSION:  v1.0
+#       CREATED:  2021-11-20 03:26 UTC
+# ==============================================================================
+
+import sys
+
+if len(sys.argv) < 2 or sys.argv[1] in ('-h', '--help'):
+    # No language pair given (or help requested): show usage and stop.
+    print('usage: ' + sys.argv[0] + ' xx-yy')
+    print('example: ')
+    print('         ' + sys.argv[0] + ' csv-header')
+    print('         ' + sys.argv[0] + ' en-pt-BR')
+    print('         ' + sys.argv[0] + ' en-ti_ER')
+
+    sys.exit()
+
+# print(sys.argv[1].find('en-'))
+
+if sys.argv[1] == 'csv-header':  # emit only the CSV header row
+    print(','.join([
+        'tmx_filename_original_lang',
+        'source_lang_original',
+        'source_lang_bcp47',
+        'target_lang_original',
+        'target_lang_bcp47',
+    ]))
+    sys.exit()
+
+if sys.argv[1].startswith('en-'):
+    # Only "en-<target>" names are handled; the source is hardcoded to 'en'.
+    line_items = []
+    lang_part_original = sys.argv[1]
+    lang_part_source_original = 'en'
+    lang_part_source_bc47 = 'en'
+    # Strip only the leading "en-", so a target containing "en-" stays intact.
+    lang_part_target_original = lang_part_original[len('en-'):]
+    lang_part_target_bc47 = lang_part_target_original.replace('_', '-')
+    line_items.append(lang_part_original)
+    line_items.append(lang_part_source_original)
+    line_items.append(lang_part_source_bc47)
+    line_items.append(lang_part_target_original)
+    line_items.append(lang_part_target_bc47)
+    print(','.join(line_items))
+    sys.exit()
+
+
+# The original file naming was so inconsistent that we will not implement
+# every option in this function just to provide quick metadata info.
+# We will also generate the full dataset later, so this is not a problem.
+raise Exception('Not implemented')