Skip to content

Commit

Permalink
rename
Browse files Browse the repository at this point in the history
  • Loading branch information
sogladev committed Mar 26, 2024
1 parent 839c36f commit 251f1e2
Show file tree
Hide file tree
Showing 5 changed files with 39 additions and 75 deletions.
2 changes: 1 addition & 1 deletion src/format_english.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@
"outputs": [],
"source": [
"from utils.utils import replace_word_in_field_with_underscore\n",
"from utils.dataset import load_english as load_data"
"from utils.dataset import load_data"
]
},
{
Expand Down
2 changes: 1 addition & 1 deletion src/format_english.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
DATASET = args.dataset
IS_GENERATE_PDF = args.generate_pdf

load_data = EnglishData(DATASET).load_english
load_data = EnglishData(DATASET).load_data

# %% [markdown]
# ## HTML+PDF all columns alphabetical
Expand Down
2 changes: 1 addition & 1 deletion src/format_spanish.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,7 @@
"metadata": {},
"outputs": [],
"source": [
"from utils.dataset import load_spanish2 as load_data2"
"from utils.dataset import load_data as load_data2"
]
},
{
Expand Down
75 changes: 5 additions & 70 deletions src/format_spanish.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,3 @@
# %%
import re

import pandas as pd

# %%
import argparse

from utils.dataset import SpanishData
Expand All @@ -17,7 +11,8 @@
DATASET = args.dataset
IS_GENERATE_PDF = args.generate_pdf

load_data = SpanishData(DATASET).load_spanish
load_data = SpanishData(DATASET).load_data
cefrs_data_by_dataset = SpanishData(DATASET).cefrs_data_by_dataset

# HTML, Underscore, Shuffled and Alphabetical
def format_html_all_columns(is_shuffle=True, is_alphabetical=False, with_underscore=True):
Expand Down Expand Up @@ -62,38 +57,9 @@ def format_html_all_columns(is_shuffle=True, is_alphabetical=False, with_undersc
f.write('<meta charset="UTF-8">'+html)

filenames = [DATASET+'_alphabetical', DATASET+'_shuffled', DATASET+'_underscore_shuffled', DATASET + '_underscore_alphabetical']
#for filename in filenames:
# cmd = f'pandoc -f html -t pdf output/{filename}.html -t html5 -o output/{filename}.pdf --metadata pagetitle="{filename}" -V margin-top=1cm -V margin-bottom=1cm -V margin-left=1cm -V margin-right=1cm -c format/table.css '
# os.system(cmd)

# ## HTML+PDF all columns grouped by CEFR

# By Ranking / pseudo-cefr. Shuffled
def cefrs_data_by_dataset(data):
    """Split ``data`` into consecutive 1000-row bands labelled with CEFR levels.

    The set of levels is chosen from the module-level ``DATASET`` name:

    * ``*_3000`` datasets           -> A1, A2, B1
    * ``*_5000_exclusive`` datasets -> B2, C1
    * anything else                 -> A1, A2, B1, B2, C1

    Every band except the last holds rows ``[k*1000, (k+1)*1000)``; the last
    band takes all remaining rows.

    Args:
        data: object exposing positional slicing through ``.iloc``.

    Returns:
        ``(cefrs, data_by_cefr)``: the list of level labels and the list of
        slices, in the same order and of the same length.
    """
    # This module loads its data through SpanishData, so the Spanish dataset
    # names must select the same level sets as the original Oxford names;
    # otherwise they fall through to the five-level branch and produce empty
    # trailing bands.
    if DATASET in ('oxford_3000', 'spanish_3000'):
        cefrs = ['A1', 'A2', 'B1']
    elif DATASET in ('oxford_5000_exclusive', 'spanish_5000_exclusive'):
        cefrs = ['B2', 'C1']
    else:
        cefrs = ['A1', 'A2', 'B1', 'B2', 'C1']

    last = len(cefrs) - 1
    # Fixed-width bands for every level but the last one...
    data_by_cefr = [data.iloc[i * 1000:(i + 1) * 1000] for i in range(last)]
    # ...and an open-ended band so no trailing rows are dropped.
    data_by_cefr.append(data.iloc[last * 1000:])
    return cefrs, data_by_cefr

# Complete to HTML
def format_html_columns_by_cefr(is_shuffle=True, is_alphabetical=False, with_underscore=True):
data = load_data()
Expand All @@ -104,7 +70,6 @@ def format_html_columns_by_cefr(is_shuffle=True, is_alphabetical=False, with_und
cefrs, data_by_cefr = cefrs_data_by_dataset(data)

data_by_cefr[1].head()
cefrs = ['A1', 'A2', 'B1', 'B2', 'C1']

html_out = ''
for i, data_slice in enumerate(data_by_cefr):
Expand Down Expand Up @@ -146,17 +111,7 @@ def format_html_columns_by_cefr(is_shuffle=True, is_alphabetical=False, with_und
# shuffle and alphabetical
# Fix supertabular and add \textit to type
column_format = 'p{1.2in}p{2.3in}p{1.2in}p{2.3in}'
def fix_latex_line(line):
    """Rewrite a single LaTeX line; return it unchanged if no rule applies.

    Rules are tried in order and the first match wins:

    1. A line starting with ``\\begin{supertabular}`` is replaced by that
       opening followed by the module-level ``column_format`` in braces.
    2. A line starting with a backslash command that contains ``{tabular}``
       is replaced by the empty string.
    3. A line of the form ``word ... (type)`` gets its parenthesised part
       wrapped in ``\\textit{...}``.
    """
    # Rule 1: attach the column specification to the supertabular opening.
    if re.match(r"^\\begin{supertabular}", line) is not None:
        return '\\begin{supertabular}' + '{' + column_format + '}'

    # Rule 2: drop {tabular} command lines.
    if re.match(r"^\\.*{tabular}", line) is not None:
        return ''

    # Rule 3: italicise the parenthesised group; anything else passes through.
    if re.match(r"^\w+\s.*\(\w+\s?\w+?\)", line) is None:
        return line
    return re.sub(r"(^\w+\s.*)(\(\w+\s?\w+?\))", r"\1\\textit{\2}", line)
fix_latex_line = FixLatexLine(column_format).fix_latex_line

def format_latex_columns(is_alphabetical=False, is_shuffle=True):
# columns = ["word", "type", "english", "frequency_rank"]
Expand Down Expand Up @@ -197,17 +152,7 @@ def format_latex_columns(is_alphabetical=False, is_shuffle=True):
# two column, cefr by rank
# Fix supertabular and add \textit to type
column_format = 'p{1.2in}p{2.3in}p{1.2in}p{2.3in}'
def fix_latex_line(line):
    """Rewrite a single LaTeX line; return it unchanged if no rule applies.

    Rules are tried in order and the first match wins:

    1. A line starting with ``\\begin{supertabular}`` is replaced by that
       opening followed by the module-level ``column_format`` in braces.
    2. A line starting with a backslash command that contains ``{tabular}``
       is replaced by the empty string.
    3. A line of the form ``word ... (type)`` gets its parenthesised part
       wrapped in ``\\textit{...}``.
    """
    # Rule 1: attach the column specification to the supertabular opening.
    if re.match(r"^\\begin{supertabular}", line) is not None:
        return '\\begin{supertabular}' + '{' + column_format + '}'

    # Rule 2: drop {tabular} command lines.
    if re.match(r"^\\.*{tabular}", line) is not None:
        return ''

    # Rule 3: italicise the parenthesised group; anything else passes through.
    if re.match(r"^\w+\s.*\(\w+\s?\w+?\)", line) is None:
        return line
    return re.sub(r"(^\w+\s.*)(\(\w+\s?\w+?\))", r"\1\\textit{\2}", line)
fix_latex_line = FixLatexLine(column_format).fix_latex_line

def format_latex_columns_by_cefr(is_alphabetical=True, is_shuffle=False):
data = load_data()
Expand Down Expand Up @@ -260,17 +205,7 @@ def format_latex_columns_by_cefr(is_alphabetical=True, is_shuffle=False):
#column_format = 'p{1.0in}p{3.0in}p{3.0in}' # total 8.3in - 0.7874in - column_width
#column_format = 'p{0.8in}p{1.1in}p{2.55in}p{2.55in}' # total 8.3in - 0.7874in - column_width
column_format = 'p{0.9in}p{1.0in}p{2.8in}p{2.30in}' # total 8.3in - 0.7874in - column_width
def fix_latex_line(line):
    """Rewrite a single LaTeX line; return it unchanged if no rule applies.

    Rules are tried in order and the first match wins:

    1. A line starting with ``\\begin{supertabular}`` is replaced by that
       opening followed by the module-level ``column_format`` in braces.
    2. A line starting with a backslash command that contains ``{tabular}``
       is replaced by the empty string.
    3. A line of the form ``word ... (type)`` gets its parenthesised part
       wrapped in ``\\textit{...}``.
    """
    # Rule 1: attach the column specification to the supertabular opening.
    if re.match(r"^\\begin{supertabular}", line) is not None:
        return '\\begin{supertabular}' + '{' + column_format + '}'

    # Rule 2: drop {tabular} command lines.
    if re.match(r"^\\.*{tabular}", line) is not None:
        return ''

    # Rule 3: italicise the parenthesised group; anything else passes through.
    if re.match(r"^\w+\s.*\(\w+\s?\w+?\)", line) is None:
        return line
    return re.sub(r"(^\w+\s.*)(\(\w+\s?\w+?\))", r"\1\\textit{\2}", line)
fix_latex_line = FixLatexLine(column_format).fix_latex_line

def format_latex_columns_by_cefr_with_example(is_alphabetical=True, is_shuffle=False):
data = load_data()
Expand Down
33 changes: 31 additions & 2 deletions src/utils/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ class EnglishData:
def __init__(self, dataset: str):
self.dataset = dataset

def load_english(self):
def load_data(self):
dataset = self.dataset
df = pd.read_pickle(f"./data/english/{dataset}.pkl")
df.head()
Expand All @@ -27,4 +27,33 @@ def load_data(self):
data = data.iloc[:3000] # A1, A2, B1
elif dataset == 'spanish_5000_exclusive':
data = data.iloc[3000:] # B2, C1
return data
return data

# By Ranking / pseudo-cefr. Shuffled
def cefrs_data_by_dataset(self, data):
    """Cut ``data`` into consecutive 1000-row bands, one per CEFR label.

    The labels depend on ``self.dataset``:

    * ``'spanish_3000'``           -> A1, A2, B1
    * ``'spanish_5000_exclusive'`` -> B2, C1
    * any other name               -> A1, A2, B1, B2, C1

    All bands but the last cover rows ``[k*1000, (k+1)*1000)``; the last band
    runs to the end of ``data``.

    Args:
        data: object exposing positional slicing through ``.iloc``.

    Returns:
        ``(labels, bands)``: two lists of equal length, in matching order.
    """
    name = self.dataset
    if name == 'spanish_3000':
        labels = ['A1', 'A2', 'B1']
    elif name == 'spanish_5000_exclusive':
        labels = ['B2', 'C1']
    else:
        labels = ['A1', 'A2', 'B1', 'B2', 'C1']

    final = len(labels) - 1
    # Fixed-width bands first, then one open-ended band for the remainder.
    bands = [data.iloc[k * 1000:(k + 1) * 1000] for k in range(final)]
    bands.append(data.iloc[final * 1000:])
    return labels, bands

# Complete to HTML

0 comments on commit 251f1e2

Please sign in to comment.