Skip to content

Commit

Permalink
Minor refactoring
Browse files Browse the repository at this point in the history
  • Loading branch information
PJ-Finlay committed Aug 13, 2024
1 parent a2cd1f9 commit 635f4eb
Show file tree
Hide file tree
Showing 4 changed files with 25 additions and 64 deletions.
14 changes: 7 additions & 7 deletions argostranslate/sbd.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,21 @@
from __future__ import annotations

from difflib import SequenceMatcher
from typing import List, Optional

import spacy

from argostranslate import package
from argostranslate.package import Package
from argostranslate.utils import info

from typing import List
from typing import Optional
from difflib import SequenceMatcher

import spacy

class ISentenceBoundaryDetectionModel:
    """Interface for sentence boundary detection models.

    Reference implementation:
    https://github.com/argosopentech/sbd/blob/main/main.py
    """

    def split_sentences(self, text: str, lang_code: Optional[str] = None) -> List[str]:
        """Split *text* into a list of sentences.

        Args:
            text: The text to segment into sentences.
            lang_code: Optional language-code hint for the detector.

        Raises:
            NotImplementedError: Always; subclasses must override.
        """
        raise NotImplementedError


# Spacy sentence boundary detection Sentencizer
# https://community.libretranslate.com/t/sentence-boundary-detection-for-machine-translation/606/3

Expand All @@ -25,7 +24,7 @@ def split_sentences(self, text: str, lang_code: Optional[str] = None) -> List[st
class SpacySentencizerSmall(ISentenceBoundaryDetectionModel):
def __init__(self):
try:
self.nlp = spacy.load("xx_sent_ud_sm")
self.nlp = spacy.load("xx_sent_ud_sm")
except OSError:
spacy.cli.download("xx_sent_ud_sm")
self.nlp = spacy.load("xx_sent_ud_sm")
Expand All @@ -38,6 +37,7 @@ def split_sentences(self, text: str, lang_code: Optional[str] = None) -> List[st
def __str__(self):
return "Spacy xx_sent_ud_sm"


# Few Shot Sentence Boundary Detection

fewshot_prompt = """<detect-sentence-boundaries> I walked down to the river. Then I went to the
Expand Down Expand Up @@ -124,4 +124,4 @@ def detect_sentence(
sbd_translated_guess = sbd_translation.translate(
DETECT_SENTENCE_BOUNDARIES_TOKEN + sentence_guess
)
return process_seq2seq_sbd(input_text, sbd_translated_guess)
return process_seq2seq_sbd(input_text, sbd_translated_guess)
2 changes: 1 addition & 1 deletion argostranslate/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def get_setting(key: str, default=None):

package_index = get_setting(
"ARGOS_PACKAGE_INDEX",
default="https://raw.githubusercontent.com/argosopentech/argospm-index/main/index.json",
default="https://raw.githubusercontent.com/argosopentech/argospm-index/main/",
)

packages_dir = Path(get_setting("ARGOS_PACKAGES_DIR", default=data_dir / "packages"))
Expand Down
2 changes: 1 addition & 1 deletion argostranslate/tags.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ def is_same_structure(tag1: ITag | str, tag2: ITag | str) -> bool:
ARGOS_OPEN_TAG = "<argos-tag>"
ARGOS_CLOSE_TAG = "</argos-tag>"

GOLDEN_RATIO = (1 + 5**0.5) / 2
GOLDEN_RATIO = (1 + 5 ** 0.5) / 2


def flatten_tag(tag: ITag) -> str:
Expand Down
71 changes: 16 additions & 55 deletions argostranslate/translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,11 @@
import argostranslate.models
import argostranslate.package
import argostranslate.settings
from argostranslate.utils import error, info, warning

from argostranslate import apis, fewshot, package, sbd, settings
from argostranslate.package import Package
from argostranslate.utils import info
from argostranslate.sbd import SpacySentencizerSmall
from argostranslate.utils import error, info, warning


class Hypothesis:
"""Represents a translation hypothesis
Expand Down Expand Up @@ -247,6 +246,7 @@ def __init__(self, from_lang: Language, to_lang: Language, api):
def hypotheses(self, from_text: str, num_hypotheses: int = 1) -> list[Hypothesis]:
"""LibreTranslate only supports single hypotheses.
TODO LibreTranslate now supports multiple hypotheses
A list of length num_hypotheses will be returned with identical hypotheses.
"""
result = self.api.translate(from_text, self.from_lang.code, self.to_lang.code)
Expand All @@ -269,7 +269,7 @@ def get_chunk_package(from_code):


def chunk(from_text, from_code):
# TODO Support Stanza and spacy
# TODO Support Spacy
if argostranslate.settings.chunk_type == argostranslate.settings.ChunkType.NONE:
return [from_text]
elif (
Expand Down Expand Up @@ -315,56 +315,6 @@ def apply_chunk_translation(from_text, ctranslate2_translator, sp_processor):
return [from_text]


class FewShotTranslation(ITranslation):
    """A translation performed with a few shot language model.

    Builds a few-shot prompt describing the requested language pair,
    sends it to the underlying language model, and parses the model's
    response into a translation result.
    """

    from_lang: Language
    to_lang: Language
    language_model: argostranslate.models.ILanguageModel

    def __init__(
        self,
        from_lang: Language,
        to_lang: Language,
        language_model: argostranslate.models.ILanguageModel,
    ):
        self.from_lang = from_lang
        self.to_lang = to_lang
        self.language_model = language_model

    def hypotheses(self, from_text: str, num_hypotheses: int = 1) -> list[Hypothesis]:
        """Return num_hypotheses translation hypotheses for from_text.

        The language model produces a single result, so the returned list
        contains num_hypotheses identical Hypothesis objects (score 0).
        On model failure an empty-string hypothesis list is returned
        rather than raising.
        """
        # TODO: Split into chunks
        prompt = argostranslate.fewshot.generate_prompt(
            from_text,
            self.from_lang.name,
            self.from_lang.code,
            self.to_lang.name,
            self.to_lang.code,
        )
        info("fewshot prompt", prompt)
        response = self.language_model.infer(prompt)
        # Log the response once here; the original logged it a second
        # time after the None check, which was redundant.
        info("fewshot response", response)
        if response is None:
            # Best-effort behavior: surface the failure in the logs and
            # return empty hypotheses instead of raising.
            error("fewshot response is None")
            return [Hypothesis("", 0)] * num_hypotheses
        result = argostranslate.fewshot.parse_inference(response)
        info("fewshot result", result)
        return [Hypothesis(result, 0)] * num_hypotheses


class LocalTranslation(ITranslation):
    """Translation backed by a locally installed Translator.

    Thin adapter that pins a (from_lang, to_lang) pair and delegates
    the actual translation work to the shared Translator instance.
    """

    def __init__(self, translator, from_lang, to_lang):
        self.translator = translator
        self.from_lang = from_lang
        self.to_lang = to_lang

    def hypotheses(self, from_text, num_hypotheses=4):
        """Delegate to the underlying Translator for this language pair."""
        source_code = self.from_lang.code
        target_code = self.to_lang.code
        return self.translator.translate(
            from_text, source_code, target_code, num_hypotheses
        )


class Translator:
def __init__(self, pkg: argostranslate.package.Package):
# TODO: Cache to prevent memory leaks
Expand Down Expand Up @@ -402,7 +352,6 @@ def remove_target_prefix(self, translated_tokens):

def translate(self, from_text, from_code, to_code, num_hypotheses):
# Split sentences
# TODO add spacy chunking from v1
sentences = chunk(from_text, from_code)

# Tokenize
Expand Down Expand Up @@ -455,6 +404,18 @@ def get_translation(self, from_lang, to_lang):
return LocalTranslation(self, from_lang, to_lang)


class LocalTranslation(ITranslation):
    """An ITranslation that runs inference on a local Translator."""

    def __init__(self, translator, from_lang, to_lang):
        # Shared Translator instance that performs the actual inference.
        self.translator = translator
        self.from_lang = from_lang
        self.to_lang = to_lang

    def hypotheses(self, from_text, num_hypotheses=4):
        """Return up to num_hypotheses translations of from_text,
        delegating to the Translator with this pair's language codes."""
        return self.translator.translate(
            from_text, self.from_lang.code, self.to_lang.code, num_hypotheses
        )


def get_installed_languages() -> list[Language]:
"""Returns a list of Languages installed from packages"""

Expand Down

0 comments on commit 635f4eb

Please sign in to comment.