forked from datasets/language-codes
-
Notifications
You must be signed in to change notification settings - Fork 0
/
language-codes.sh
executable file
·112 lines (97 loc) · 3.3 KB
/
language-codes.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#!/bin/bash
# language-codes.sh - format standard language codes
# usage: language-codes.sh [-fk] [source file | url] [destination]
#######################################
# Undo directory creation after a failed run, removing only what this run
# created so that pre-existing user data is never deleted and a retry
# without -f is not refused.
# Arguments:
#   $1 - temporary source directory
#   $2 - "true" if this run created $1
#   $3 - destination directory
#   $4 - "true" if this run created $3
# Returns: 0 always (best-effort cleanup)
#######################################
_language_codes_rollback() {
  if [[ ${2} == true ]]; then
    rm -rf -- "${1}"
  fi
  if [[ ${4} == true ]]; then
    # rmdir refuses non-empty directories, so unexpected content survives.
    rmdir -- "${3}" 2>/dev/null
  fi
  return 0
}

#######################################
# Fetch the ISO 639-2 code list (pipe-separated text) and write three CSV
# files into the destination directory:
#   language-codes-full.csv  all records and fields
#   language-codes.csv       alpha2 + English name, sorted
#   language-codes-3b2.csv   alpha3-b + alpha2 + English name, sorted
# Options:
#   -f  force: allow the destination / temporary directory to exist already
#   -k  keep the temporary source copy (<dest>/src/source.txt) and write
#       <dest>/src/log.txt recording where it came from
# Arguments:
#   $1 - source file path or http(s) URL (default: the loc.gov list)
#   $2 - destination directory (default: "data")
# Outputs: diagnostics on stderr
# Returns: 0 on success, 1 on any failure
#######################################
language-codes() {
  local dest="data"
  local input="http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt"

  # Arguments. OPTIND is made local so repeated calls (or a caller that uses
  # getopts itself) are not affected by this function's option parsing.
  local force=false
  local keep=false
  local opt
  local OPTIND=1
  while getopts ":fk" opt; do
    case "${opt}" in
      f)
        # Force overwrite existing paths
        force=true
        ;;
      k)
        # Keep temporary source copy and log
        keep=true
        ;;
      *)
        # Unknown options are silently ignored (getopts runs in silent mode).
        ;;
    esac
  done
  shift $((OPTIND - 1))
  if [[ -n ${1:-} ]]; then
    input="${1}"
  fi
  if [[ -n ${2:-} ]]; then
    dest="${2}"
  fi

  # Internal vars
  local self_name="${BASH_SOURCE[0]##*/}"
  local src="${dest}/src"
  local copy="source.txt"
  local log="log.txt"

  # Protect preexisting paths
  if [[ ${force} != true && -e ${dest} ]]; then
    printf '%s: destination directory already exists: %s\n' \
      "${self_name}" "${dest}" >&2
    return 1
  fi
  if [[ ${force} != true && -e ${src} ]]; then
    printf '%s: temporary source directory already exists: %s\n' \
      "${self_name}" "${src}" >&2
    return 1
  fi

  # Remember which directories this run creates so that a failure can be
  # rolled back without touching anything that was already there.
  local dest_created=false
  local src_created=false
  [[ -e ${dest} ]] || dest_created=true
  [[ -e ${src} ]] || src_created=true

  # Create paths (quoted: the destination may contain whitespace)
  if ! mkdir -p -- "${dest}" 2>/dev/null; then
    printf '%s: unable to create destination directory: %s\n' \
      "${self_name}" "${dest}" >&2
    return 1
  fi
  if ! mkdir -p -- "${src}" 2>/dev/null; then
    _language_codes_rollback "${src}" false "${dest}" "${dest_created}"
    printf '%s: unable to create temporary source directory: %s\n' \
      "${self_name}" "${src}" >&2
    return 1
  fi

  # Get source. The scheme is whatever precedes the first "://"; no "://"
  # means the argument is treated as a local file path.
  local fetched=false
  local scheme=""
  if [[ ${input} == *://* ]]; then
    scheme="${input%%://*}"
  fi
  case "${scheme}" in
    http | https)
      # -f: treat HTTP errors as failure instead of saving the error page;
      # -L: follow redirects (e.g. http -> https).
      if curl -fsL "${input}" > "${src}/${copy}" 2>/dev/null; then
        fetched=true
      fi
      ;;
    "")
      if [[ -f ${input} && -r ${input} ]] \
        && cp -- "${input}" "${src}/${copy}" 2>/dev/null; then
        fetched=true
      fi
      ;;
  esac

  # Sanity checks
  if [[ ${fetched} != true ]]; then
    _language_codes_rollback "${src}" "${src_created}" "${dest}" "${dest_created}"
    printf '%s: unable to fetch source: %s\n' "${self_name}" "${input}" >&2
    return 1
  fi
  if ! touch -- "${dest}/language-codes-full.csv" \
    "${dest}/language-codes.csv" \
    "${dest}/language-codes-3b2.csv"; then
    _language_codes_rollback "${src}" "${src_created}" "${dest}" "${dest_created}"
    printf '%s: unable to write output\n' "${self_name}" >&2
    return 1
  fi

  # Format all records and fields. Each awk program first strips a UTF-8
  # byte-order mark from line 1; "$1=$1" forces awk to rebuild the record
  # with OFS so every field ends up separated by '","'.
  printf '%s\n' '"alpha3-b","alpha3-t","alpha2","English","French"' \
    > "${dest}/language-codes-full.csv"
  awk -F'|' -v QQ='"' -v OFS='","' 'NR==1 { sub(/^\xef\xbb\xbf/, "") } $1=$1 { print QQ $0 QQ }' \
    "${src}/${copy}" >> "${dest}/language-codes-full.csv"

  # Only alpha2 (records whose third field is non-empty)
  printf '%s\n' '"alpha2","English"' > "${dest}/language-codes.csv"
  awk -F'|' 'NR==1 { sub(/^\xef\xbb\xbf/, "") } $3 { printf "\"%s\",\"%s\"\n", $3, $4 }' \
    "${src}/${copy}" \
    | sort >> "${dest}/language-codes.csv"

  # Only alpha3-b with corresponding alpha2
  printf '%s\n' '"alpha3-b","alpha2","English"' > "${dest}/language-codes-3b2.csv"
  awk -F'|' 'NR==1 { sub(/^\xef\xbb\xbf/, "") } $3 { printf "\"%s\",\"%s\",\"%s\"\n", $1, $3, $4 }' \
    "${src}/${copy}" \
    | sort >> "${dest}/language-codes-3b2.csv"

  if [[ ${keep} == true ]]; then
    # Log source location
    printf '%s created from %s\n' "${copy}" "${input}" > "${src}/${log}"
  else
    # Clean up
    rm -rf -- "${src}"
  fi
  return 0
}
# Entry point: run the function with the script's command-line arguments.
language-codes "$@"