diff --git a/argostranslate/sbd.py b/argostranslate/sbd.py index ebc418db..65e24a0b 100644 --- a/argostranslate/sbd.py +++ b/argostranslate/sbd.py @@ -1,22 +1,21 @@ from __future__ import annotations from difflib import SequenceMatcher +from typing import List, Optional + +import spacy from argostranslate import package from argostranslate.package import Package from argostranslate.utils import info -from typing import List -from typing import Optional -from difflib import SequenceMatcher - -import spacy class ISentenceBoundaryDetectionModel: # https://github.com/argosopentech/sbd/blob/main/main.py def split_sentences(self, text: str, lang_code: Optional[str] = None) -> List[str]: raise NotImplementedError + # Spacy sentence boundary detection Sentencizer # https://community.libretranslate.com/t/sentence-boundary-detection-for-machine-translation/606/3 @@ -25,7 +24,7 @@ def split_sentences(self, text: str, lang_code: Optional[str] = None) -> List[st class SpacySentencizerSmall(ISentenceBoundaryDetectionModel): def __init__(self): try: - self.nlp = spacy.load("xx_sent_ud_sm") + self.nlp = spacy.load("xx_sent_ud_sm") except OSError: spacy.cli.download("xx_sent_ud_sm") self.nlp = spacy.load("xx_sent_ud_sm") @@ -38,6 +37,7 @@ def split_sentences(self, text: str, lang_code: Optional[str] = None) -> List[st def __str__(self): return "Spacy xx_sent_ud_sm" + # Few Shot Sentence Boundary Detection fewshot_prompt = """ I walked down to the river. Then I went to the @@ -124,4 +124,4 @@ def detect_sentence( sbd_translated_guess = sbd_translation.translate( DETECT_SENTENCE_BOUNDARIES_TOKEN + sentence_guess ) - return process_seq2seq_sbd(input_text, sbd_translated_guess) \ No newline at end of file + return process_seq2seq_sbd(input_text, sbd_translated_guess) diff --git a/argostranslate/settings.py b/argostranslate/settings.py index bfd1f345..4dd3364b 100644 --- a/argostranslate/settings.py +++ b/argostranslate/settings.py @@ -86,7 +86,7 @@ def get_setting(key: str, default=None): package_index = get_setting( "ARGOS_PACKAGE_INDEX", - default="https://raw.githubusercontent.com/argosopentech/argospm-index/main/index.json", + default="https://raw.githubusercontent.com/argosopentech/argospm-index/main/", ) packages_dir = Path(get_setting("ARGOS_PACKAGES_DIR", default=data_dir / "packages")) diff --git a/argostranslate/tags.py b/argostranslate/tags.py index ff656ccf..b42aac2c 100644 --- a/argostranslate/tags.py +++ b/argostranslate/tags.py @@ -114,7 +114,7 @@ def is_same_structure(tag1: ITag | str, tag2: ITag | str) -> bool: ARGOS_OPEN_TAG = "" ARGOS_CLOSE_TAG = "" -GOLDEN_RATIO = (1 + 5**0.5) / 2 +GOLDEN_RATIO = (1 + 5 ** 0.5) / 2 def flatten_tag(tag: ITag) -> str: diff --git a/argostranslate/translate.py b/argostranslate/translate.py index f8344e08..ca9cf6e8 100644 --- a/argostranslate/translate.py +++ b/argostranslate/translate.py @@ -12,12 +12,11 @@ import argostranslate.models import argostranslate.package import argostranslate.settings -from argostranslate.utils import error, info, warning - from argostranslate import apis, fewshot, package, sbd, settings from argostranslate.package import Package -from argostranslate.utils import info from argostranslate.sbd import SpacySentencizerSmall +from argostranslate.utils import error, info, warning + class Hypothesis: """Represents a translation hypothesis @@ -247,6 +246,7 @@ def __init__(self, from_lang: Language, to_lang: Language, api): def hypotheses(self, from_text: str, num_hypotheses: int = 1) -> list[Hypothesis]: """LibreTranslate only supports single hypotheses. + TODO LibreTranslate now supports multiple hypotheses A list of length num_hypotheses will be returned with identical hypotheses. """ result = self.api.translate(from_text, self.from_lang.code, self.to_lang.code) @@ -269,7 +269,7 @@ def get_chunk_package(from_code): def chunk(from_text, from_code): - # TODO Support Stanza and spacy + # TODO Support Spacy if argostranslate.settings.chunk_type == argostranslate.settings.ChunkType.NONE: return [from_text] elif ( @@ -315,56 +315,6 @@ def apply_chunk_translation(from_text, ctranslate2_translator, sp_processor): return [from_text] -class FewShotTranslation(ITranslation): - """A translation performed with a few shot language model""" - - from_lang: Language - to_lang: Language - language_model: argostranslate.models.ILanguageModel - - def __init__( - self, - from_lang: Language, - to_lang: Language, - language_model: argostranslate.models.ILanguageModel, - ): - self.from_lang = from_lang - self.to_lang = to_lang - self.language_model = language_model - - def hypotheses(self, from_text: str, num_hypotheses: int = 1) -> list[Hypothesis]: - # TODO: Split into chunks - prompt = argostranslate.fewshot.generate_prompt( - from_text, - self.from_lang.name, - self.from_lang.code, - self.to_lang.name, - self.to_lang.code, - ) - info("fewshot prompt", prompt) - response = self.language_model.infer(prompt) - info("fewshot response", response) - if response is None: - error("fewshot response is None") - return [Hypothesis("", 0)] * num_hypotheses - info("fewshot response", response) - result = argostranslate.fewshot.parse_inference(response) - info("fewshot result", result) - return [Hypothesis(result, 0)] * num_hypotheses - - -class LocalTranslation(ITranslation): - def __init__(self, translator, from_lang, to_lang): - self.translator = translator - self.from_lang = from_lang - self.to_lang = to_lang - - def hypotheses(self, from_text, num_hypotheses=4): - return self.translator.translate( - from_text, self.from_lang.code, self.to_lang.code, num_hypotheses - ) - - class Translator: def __init__(self, pkg: argostranslate.package.Package): # TODO: Cache to prevent memory leaks @@ -402,7 +352,6 @@ def remove_target_prefix(self, translated_tokens): def translate(self, from_text, from_code, to_code, num_hypotheses): # Split sentences - # TODO add spacy chunking from v1 sentences = chunk(from_text, from_code) # Tokenize @@ -455,6 +404,18 @@ def get_translation(self, from_lang, to_lang): return LocalTranslation(self, from_lang, to_lang) +class LocalTranslation(ITranslation): + def __init__(self, translator, from_lang, to_lang): + self.translator = translator + self.from_lang = from_lang + self.to_lang = to_lang + + def hypotheses(self, from_text, num_hypotheses=4): + return self.translator.translate( + from_text, self.from_lang.code, self.to_lang.code, num_hypotheses + ) + + def get_installed_languages() -> list[Language]: """Returns a list of Languages installed from packages"""