-
Notifications
You must be signed in to change notification settings - Fork 0
/
unhyphenate.py
62 lines (48 loc) · 2.16 KB
/
unhyphenate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#Ver 1.2
# See https://github.com/pd3f/dehyphen/issues/7 for the background: why this script is needed and what it is for.
import sys
from dehyphen import FlairScorer, text_to_format
import os
def flatten_list(nested_list):
    """Flatten a three-level nested list into one flat list.

    Each top-level item contributes a single newline string followed by
    every innermost element found in its sub-lists, in their original order.
    The walk is exactly three levels deep; it is not recursive.

    Args:
        nested_list: An iterable of iterables of iterables.

    Returns:
        A new list holding one newline string per top-level item, each
        followed by that item's innermost elements.
    """
    flattened = []
    for group in nested_list:
        # The EOL marker goes in before the group's contents, so the result
        # starts with a newline string whenever the input is non-empty.
        flattened.append("\n")
        flattened.extend(token for row in group for token in row)
    return flattened
def main(input_file, lang="pl"):
    """Dehyphenate a text file and write the result next to it.

    Reads ``input_file`` as UTF-8, converts it with ``text_to_format``, runs
    ``FlairScorer.dehyphen`` on the result, and writes the text to
    ``<input name without extension>_unhyphenated.txt`` in the same
    directory as the input. A confirmation line is printed when done.

    Args:
        input_file: Path of the UTF-8 text file to process.
        lang: Language code passed to ``FlairScorer``. Defaults to ``"pl"``,
            the value that was previously hard-coded in this function.

    Raises:
        OSError: If the input cannot be opened or the output cannot be
            written.
        UnicodeDecodeError: If the input is not valid UTF-8.
    """
    # Read the input before building the scorer so that a bad path or a bad
    # encoding is reported immediately.
    # NOTE(review): scorer construction is presumably the slow step (model
    # loading) - confirm against the dehyphen documentation.
    with open(input_file, 'r', encoding='utf-8') as file:
        source_text = file.read()

    scorer = FlairScorer(lang=lang)

    # Convert the raw text into the structure expected by the scorer, then
    # let the scorer process it.
    special_format = text_to_format(source_text)
    fixed_hyphens = scorer.dehyphen(special_format)

    if isinstance(fixed_hyphens, list):
        # Nested list result: flatten it and join the pieces with spaces.
        flattened_text = ' '.join(flatten_list(fixed_hyphens))
    else:
        # Anything else is written out unchanged (assumed to be a string).
        flattened_text = fixed_hyphens

    # Derive the output path: same directory, original name without its
    # extension, plus the "_unhyphenated.txt" suffix.
    stem = os.path.splitext(os.path.basename(input_file))[0]
    output_file = os.path.join(os.path.dirname(input_file), f"{stem}_unhyphenated.txt")

    with open(output_file, 'w', encoding='utf-8') as file:
        file.write(flattened_text)
    print(f"Unhyphenated text written to {output_file}")
if __name__ == "__main__":
    # The script takes exactly one command-line argument: the input file path.
    if len(sys.argv) == 2:
        input_file_path = sys.argv[1]
        main(input_file_path)
    else:
        print("Usage: python unhyphenate.py <path_to_input_file>")
        sys.exit(1)