Skip to content

Commit

Permalink
#1, #4, #5: scripts/fn_tico19_datainfo_tmx.py started
Browse files Browse the repository at this point in the history
  • Loading branch information
fititnt committed Nov 20, 2021
1 parent f82d7c0 commit 50135f2
Show file tree
Hide file tree
Showing 7 changed files with 210 additions and 22 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/tico-19-hxltm_etica-ai.yml
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ jobs:

# bundle exec asciidoctor-pdf --attribute allow-uri-read=1 docs/eng-Latn/index.adoc
# bundle exec asciidoctor-epub --attribute allow-uri-read=1 docs/eng-Latn/index.adoc
# bundle exec asciidoctor-pdf -v --attribute allow-uri-read=1 --attribute source-highlighter=rouge docs/eng-Latn/index.adoc -out-file docs/tico-19-hxltm_eng-Latn.pdf
# bundle exec asciidoctor-pdf -v --attribute allow-uri-read=1 --attribute source-highlighter=rouge docs/eng-Latn/index.adoc --out-file docs/tico-19-hxltm_eng-Latn.pdf
# bundle exec asciidoctor-epub3 -v --attribute allow-uri-read=1 --attribute source-highlighter=rouge docs/eng-Latn/index.adoc

# bundle exec asciidoctor-pdf -v --attribute allow-uri-read=1 docs/eng-Latn/draft-eng-Latn.adoc
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ data/tico-19-terminology-facebook.tm2.hxl.csv
# temporary
data/original/translation-memory/translators-without-borders/all/*.tmx
data/original/translation-memory/translators-without-borders/all/*.zip
data/original/translation-memory/translators-without-borders/all_bi_en_tmx

### asciidoctor ________________________________________________________________
# Don't commit books on the working branch, just the website
Expand Down
21 changes: 0 additions & 21 deletions scripts/_run-all-data-scripts.sh
Original file line number Diff line number Diff line change
Expand Up @@ -37,27 +37,6 @@
# ==============================================================================
set -e

# TODO: move this to some file related to deploy website
VAR_Gemfile=$(cat << EOF
source 'https://rubygems.org'
gem 'asciidoctor'
gem 'asciidoctor-pdf'
gem 'asciidoctor-epub3'
# https://github.com/asciidoctor/asciidoctor-bibtex
gem 'asciidoctor-bibtex'
# https://github.com/asciidoctor/asciidoctor-chart
gem 'asciidoctor-chart'
## https://github.com/asciidoctor/asciidoctor-latex
# gem 'asciidoctor-latex'
gem 'rouge'
# https://github.com/gjtorikian/html-proofer
gem 'html-proofer'
EOF
)
echo "$VAR_Gemfile" > Gemfile
# bundle install
# bundle exec asciidoctor-pdf -v --attribute allow-uri-read=1 --attribute source-highlighter=rouge docs/eng-Latn/index.adoc --out-file docs/tico-19-hxltm_eng-Latn.pdf

./scripts/data-original-download.sh

./scripts/data-external-prepare.sh
Expand Down
56 changes: 56 additions & 0 deletions scripts/_setup-local-machine.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#!/bin/sh
# ==============================================================================
#
# FILE: _setup-local-machine.sh
#
# USAGE: ./scripts/_setup-local-machine.sh
#
# DESCRIPTION: Script NOT related to data generation.
# This is only relevant if you are trying to run scripts on
# your machine (like trying to generate the preview of the
# website).
#
# OPTIONS: ---
#
# REQUIREMENTS: - POSIX Shell or better
# - ruby
# BUGS: ---
# NOTES: ---
# AUTHORS: Emerson Rocha <rocha[at]ieee.org>
# COLLABORATORS: <@TODO: put additional non-anonymous names here>
# COMPANY: EticaAI
# LICENSE: Public Domain dedication OR Zero-Clause BSD
# SPDX-License-Identifier: Unlicense OR 0BSD
# VERSION: v1.0
# CREATED: 2021-11-18 04:43 UTC started
# ==============================================================================
set -e

# TODO: move this to some file related to deploy website
# Bootstrap: only when the current directory has no Gemfile yet, write one
# listing the Asciidoctor toolchain and install the gems it names.
# The heredoc delimiter is quoted so the content is written verbatim.
if ! test -f 'Gemfile'; then
  cat > Gemfile << 'EOF'
source 'https://rubygems.org'
gem 'asciidoctor'
gem 'asciidoctor-pdf'
gem 'asciidoctor-epub3'
# https://github.com/asciidoctor/asciidoctor-bibtex
gem 'asciidoctor-bibtex'
# https://github.com/asciidoctor/asciidoctor-chart
gem 'asciidoctor-chart'
## https://github.com/asciidoctor/asciidoctor-latex
# gem 'asciidoctor-latex'
gem 'rouge'
# https://github.com/gjtorikian/html-proofer
gem 'html-proofer'
EOF
  bundle install
fi

# Trace the two render commands (PDF, then EPUB) while they run.
set -x
bundle exec asciidoctor-pdf -v --attribute allow-uri-read=1 --attribute source-highlighter=rouge docs/eng-Latn/index.adoc --out-file docs/tico-19-hxltm_eng-Latn.pdf
bundle exec asciidoctor-epub3 -v --attribute allow-uri-read=1 --attribute source-highlighter=rouge docs/eng-Latn/index.adoc --out-file docs/tico-19-hxltm_eng-Latn.epub

# Tracing off again before the final status message.
set +x
echo 'Okay'
38 changes: 38 additions & 0 deletions scripts/data-info/tico19_tm.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
tmx_filename_original_lang,source_lang_original,source_lang_bcp47,target_lang_original,target_lang_bcp47
en-ar,en,en,ar,ar
en-bn,en,en,bn,bn
en-ckb,en,en,ckb,ckb
en-din,en,en,din,din
en-es-LA,en,en,es-LA,es-LA
en-fa,en,en,fa,fa
en-fr,en,en,fr,fr
en-fuv,en,en,fuv,fuv
en-ha,en,en,ha,ha
en-hi,en,en,hi,hi
en-id,en,en,id,id
en-km,en,en,km,km
en-kr,en,en,kr,kr
en-ku,en,en,ku,ku
en-lg,en,en,lg,lg
en-ln,en,en,ln,ln
en-mr,en,en,mr,mr
en-ms,en,en,ms,ms
en-my,en,en,my,my
en-ne,en,en,ne,ne
en-nus,en,en,nus,nus
en-om,en,en,om,om
en-prs,en,en,prs,prs
en-ps,en,en,ps,ps
en-pt-BR,en,en,pt-BR,pt-BR
en-ru,en,en,ru,ru
en-rw,en,en,rw,rw
en-so,en,en,so,so
en-sw,en,en,sw,sw
en-ta,en,en,ta,ta
en-ti,en,en,ti,ti
en-ti_ER,en,en,ti_ER,ti-ER
en-ti_ET,en,en,ti_ET,ti-ET
en-tl,en,en,tl,tl
en-ur,en,en,ur,ur
en-zh,en,en,zh,zh
en-zu,en,en,zu,zu
43 changes: 43 additions & 0 deletions scripts/data-original-prepare-translation-memory.sh
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,49 @@ tico19_tmx_extract "en-ur"
tico19_tmx_extract "en-zh"
tico19_tmx_extract "en-zu"


# Rebuild scripts/data-info/tico19_tm.csv from scratch: the header row
# truncates the file, then one row per language pair is appended below.
./scripts/fn_tico19_datainfo_tmx.py "csv-header" > scripts/data-info/tico19_tm.csv

# The list of pairs in the loop below can be re-derived from the downloaded archives with:
# find data/original/TM/ -iname all.en-*.zip | grep -E '(en-...?.?.?.?).tmx' --only-matching | sed 's/.tmx//' | grep -v old | sort | xargs printf '\n./scripts/fn_tico19_datainfo_tmx.py "%s" >> scripts/data-info/tico19_tm.csv'

for tico19_lang_pair in \
  en-ar en-bn en-ckb en-din en-es-LA en-fa en-fr en-fuv en-ha en-hi \
  en-id en-km en-kr en-ku en-lg en-ln en-mr en-ms en-my en-ne \
  en-nus en-om en-prs en-ps en-pt-BR en-ru en-rw en-so en-sw en-ta \
  en-ti en-ti_ER en-ti_ET en-tl en-ur en-zh en-zu
do
  ./scripts/fn_tico19_datainfo_tmx.py "$tico19_lang_pair" >> scripts/data-info/tico19_tm.csv
done

#### scripts/data-info/tico19_tm_twb_initial-language-pairs_source-lang-en.csv ________________
# Save the languages to CSV file to reuse later
find data/original/TM/ -iname "all.en-*.zip" \
Expand Down
71 changes: 71 additions & 0 deletions scripts/fn_tico19_datainfo_tmx.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
#!/usr/bin/python3
# ==============================================================================
#
# FILE: scripts/fn_tico19_datainfo_tmx.py
#
# USAGE: ./scripts/fn_tico19_datainfo_tmx.py
#
# DESCRIPTION: Hardcoded function to generate data about what should be
# converted, based on whatever file naming was used
# in the original.
#
# OPTIONS: ---
#
# REQUIREMENTS: - python3
# BUGS: ---
# NOTES: ---
# AUTHORS: Emerson Rocha <rocha[at]ieee.org>
# COLLABORATORS: <@TODO: put additional non-anonymous names here>
# COMPANY: EticaAI
# LICENSE: Public Domain dedication OR Zero-Clause BSD
# SPDX-License-Identifier: Unlicense OR 0BSD
# VERSION: v1.0
# CREATED: 2021-11-20 03:26 UTC
# ==============================================================================

import sys

# Every TICO-19 translation memory handled here has English as source.
SOURCE_LANG = 'en'
SOURCE_PREFIX = SOURCE_LANG + '-'

# Column order of scripts/data-info/tico19_tm.csv.
CSV_COLUMNS = (
    'tmx_filename_original_lang',
    'source_lang_original',
    'source_lang_bcp47',
    'target_lang_original',
    'target_lang_bcp47',
)


def csv_header() -> str:
    """Return the CSV header line (no trailing newline)."""
    return ','.join(CSV_COLUMNS)


def csv_row(lang_pair: str) -> str:
    """Return the CSV data line for one original TMX language pair.

    Args:
        lang_pair: Pair as used in the original file names, e.g.
            'en-pt-BR' or 'en-ti_ER'. It must start with 'en-'.

    Returns:
        Comma-separated values in the order of CSV_COLUMNS. The BCP47
        target is the original target with '_' replaced by '-'.

    Raises:
        NotImplementedError: If the pair does not start with 'en-'.
    """
    if not lang_pair.startswith(SOURCE_PREFIX):
        # The original file naming was so poor that we will not implement
        # all options in a function meant only for quick metadata info.
        # The full dataset will be generated later, so this is no problem.
        raise NotImplementedError('Not implemented')

    # Strip only the leading 'en-'. A plain str.replace would also eat any
    # later 'en-' inside the target tag (e.g. 'en-ben-IN' -> 'bIN').
    target_original = lang_pair[len(SOURCE_PREFIX):]
    target_bcp47 = target_original.replace('_', '-')

    return ','.join((
        lang_pair,
        SOURCE_LANG,
        SOURCE_LANG,
        target_original,
        target_bcp47,
    ))


def usage(program: str) -> str:
    """Return the multi-line help text for the command named *program*."""
    return '\n'.join((
        'usage: ' + program + ' xx-yy',
        'example: ',
        '    ' + program + ' csv-header',
        '    ' + program + ' en-pt-BR',
        '    ' + program + ' en-ti_ER',
    ))


def main(argv=None) -> int:
    """Run the command line interface and return the process exit status.

    Args:
        argv: Full argument vector including the program name; defaults
            to sys.argv when omitted.

    Returns:
        0 after printing the help text, the CSV header or one CSV row.

    Raises:
        NotImplementedError: Propagated from csv_row() for unsupported
            language pairs.
    """
    argv = sys.argv if argv is None else argv

    if len(argv) < 2 or argv[1] in ('-h', '--help'):
        print(usage(argv[0]))
        return 0

    if argv[1] == 'csv-header':
        print(csv_header())
        return 0

    print(csv_row(argv[1]))
    return 0


if __name__ == '__main__':
    sys.exit(main())

0 comments on commit 50135f2

Please sign in to comment.