From 7677d8e66be1f347c629ecd02247bc84a7b5eaea Mon Sep 17 00:00:00 2001 From: BBC-Esq Date: Fri, 25 Oct 2024 10:04:33 -0400 Subject: [PATCH 1/9] Update tokenizer.py --- faster_whisper/tokenizer.py | 616 +++++++++++++++++++----------------- 1 file changed, 318 insertions(+), 298 deletions(-) diff --git a/faster_whisper/tokenizer.py b/faster_whisper/tokenizer.py index 3bf76a5f..8bcf2c8d 100644 --- a/faster_whisper/tokenizer.py +++ b/faster_whisper/tokenizer.py @@ -6,309 +6,329 @@ import tokenizers +class TokenizationError(Exception): + pass + + class Tokenizer: - """Simple wrapper around a tokenizers.Tokenizer.""" - - def __init__( - self, - tokenizer: tokenizers.Tokenizer, - multilingual: bool, - task: Optional[str] = None, - language: Optional[str] = None, - ): - self.tokenizer = tokenizer - - if multilingual: - if task not in _TASKS: - raise ValueError( - "'%s' is not a valid task (accepted tasks: %s)" - % (task, ", ".join(_TASKS)) - ) - - if language not in _LANGUAGE_CODES: - raise ValueError( - "'%s' is not a valid language code (accepted language codes: %s)" - % (language, ", ".join(_LANGUAGE_CODES)) - ) - - self.task = self.tokenizer.token_to_id("<|%s|>" % task) - self.language = self.tokenizer.token_to_id("<|%s|>" % language) - self.language_code = language - else: - self.task = None - self.language = None - self.language_code = "en" - - @cached_property - def transcribe(self) -> int: - return self.tokenizer.token_to_id("<|transcribe|>") - - @cached_property - def translate(self) -> int: - return self.tokenizer.token_to_id("<|translate|>") - - @cached_property - def sot(self) -> int: - return self.tokenizer.token_to_id("<|startoftranscript|>") - - @cached_property - def sot_lm(self) -> int: - return self.tokenizer.token_to_id("<|startoflm|>") - - @cached_property - def sot_prev(self) -> int: - return self.tokenizer.token_to_id("<|startofprev|>") - - @cached_property - def eot(self) -> int: - return self.tokenizer.token_to_id("<|endoftext|>") - - @cached_property - def no_timestamps(self) -> int: - return self.tokenizer.token_to_id("<|notimestamps|>") - - @property - def timestamp_begin(self) -> int: - return self.no_timestamps + 1 - - @property - def sot_sequence(self) -> List[int]: - sequence = [self.sot] - - if self.language is not None: - sequence.append(self.language) - - if self.task is not None: - sequence.append(self.task) - - return sequence - - def encode(self, text: str) -> List[int]: - return self.tokenizer.encode(text, add_special_tokens=False).ids - - def decode(self, tokens: List[int]) -> str: - text_tokens = [token for token in tokens if token < self.eot] - return self.tokenizer.decode(text_tokens) - - def decode_with_timestamps(self, tokens: List[int]) -> str: - outputs = [[]] - - for token in tokens: - if token >= self.timestamp_begin: - timestamp = f"<|{(token - self.timestamp_begin) * 0.02:.2f}|>" - outputs.append(timestamp) - outputs.append([]) - else: - outputs[-1].append(token) - - return "".join( - [s if isinstance(s, str) else self.tokenizer.decode(s) for s in outputs] - ) - - @cached_property - def non_speech_tokens(self) -> Tuple[int]: - """ - Returns the list of tokens to suppress in order to avoid any speaker tags or non-speech - annotations, to prevent sampling texts that are not actually spoken in the audio, e.g. - - - ♪♪♪ - - ( SPEAKING FOREIGN LANGUAGE ) - - [DAVID] Hey there, - - keeping basic punctuations like commas, periods, question marks, exclamation points, etc. 
- """ - symbols = list('"#()*+/:;<=>@[\\]^_`{|}~「」『』') - symbols += ( - "<< >> <<< >>> -- --- -( -[ (' (\" (( )) ((( ))) [[ ]] {{ }} ♪♪ ♪♪♪".split() - ) - - # symbols that may be a single token or multiple tokens depending on the tokenizer. - # In case they're multiple tokens, suppress the first token, which is safe because: - # These are between U+2640 and U+267F miscellaneous symbols that are okay to suppress - # in generations, and in the 3-byte UTF-8 representation they share the first two bytes. - miscellaneous = set("♩♪♫♬♭♮♯") - assert all(0x2640 <= ord(c) <= 0x267F for c in miscellaneous) - - # allow hyphens "-" and single quotes "'" between words, but not at the beginning of a word - result = {self.encode(" -")[0], self.encode(" '")[0]} - for symbol in symbols + list(miscellaneous): - for tokens in [ - self.encode(symbol), - self.encode(" " + symbol), - ]: - if len(tokens) == 1 or symbol in miscellaneous: - result.add(tokens[0]) - - return tuple(sorted(result)) - - def split_to_word_tokens( - self, tokens: List[int] - ) -> Tuple[List[str], List[List[int]]]: - if self.language_code in {"zh", "ja", "th", "lo", "my", "yue"}: - # These languages don't typically use spaces, so it is difficult to split words - # without morpheme analysis. Here, we instead split words at any - # position where the tokens are decoded as valid unicode points - return self.split_tokens_on_unicode(tokens) - - return self.split_tokens_on_spaces(tokens) - - def split_tokens_on_unicode( - self, tokens: List[int] - ) -> Tuple[List[str], List[List[int]]]: - decoded_full = self.decode_with_timestamps(tokens) - replacement_char = "\ufffd" - - words = [] - word_tokens = [] - current_tokens = [] - unicode_offset = 0 - - for token in tokens: - current_tokens.append(token) - decoded = self.decode_with_timestamps(current_tokens) - - try: - replacement_char_index = decoded.index(replacement_char) - replacement_char_index += unicode_offset - except ValueError: - replacement_char_index = None - - if replacement_char_index is None or ( - replacement_char_index < len(decoded_full) - and decoded_full[replacement_char_index] == replacement_char - ): - words.append(decoded) - word_tokens.append(current_tokens) - current_tokens = [] - unicode_offset += len(decoded) - - return words, word_tokens - - def split_tokens_on_spaces( - self, tokens: List[int] - ) -> Tuple[List[str], List[List[int]]]: - subwords, subword_tokens_list = self.split_tokens_on_unicode(tokens) - words = [] - word_tokens = [] - - for subword, subword_tokens in zip(subwords, subword_tokens_list): - special = subword_tokens[0] >= self.eot - with_space = subword.startswith(" ") - punctuation = subword.strip() in string.punctuation - if special or with_space or punctuation or len(words) == 0: - words.append(subword) - word_tokens.append(subword_tokens) - else: - words[-1] = words[-1] + subword - word_tokens[-1].extend(subword_tokens) - - return words, word_tokens + """Simple wrapper around a tokenizers.Tokenizer.""" + + def __init__( + self, + tokenizer: tokenizers.Tokenizer, + multilingual: bool, + task: Optional[str] = None, + language: Optional[str] = None, + ): + self.tokenizer = tokenizer + + if multilingual: + if task not in _TASKS: + raise ValueError( + "'%s' is not a valid task (accepted tasks: %s)" + % (task, ", ".join(_TASKS)) + ) + + if language not in _LANGUAGE_CODES: + raise ValueError( + "'%s' is not a valid language code (accepted language codes: %s)" + % (language, ", ".join(_LANGUAGE_CODES)) + ) + + self.task = self.tokenizer.token_to_id("<|%s|>" % 
task) + self.language = self.tokenizer.token_to_id("<|%s|>" % language) + self.language_code = language + else: + self.task = None + self.language = None + self.language_code = "en" + + @cached_property + def transcribe(self) -> int: + return self.tokenizer.token_to_id("<|transcribe|>") + + @cached_property + def translate(self) -> int: + return self.tokenizer.token_to_id("<|translate|>") + + @cached_property + def sot(self) -> int: + return self.tokenizer.token_to_id("<|startoftranscript|>") + + @cached_property + def sot_lm(self) -> int: + return self.tokenizer.token_to_id("<|startoflm|>") + + @cached_property + def sot_prev(self) -> int: + return self.tokenizer.token_to_id("<|startofprev|>") + + @cached_property + def eot(self) -> int: + return self.tokenizer.token_to_id("<|endoftext|>") + + @cached_property + def no_timestamps(self) -> int: + return self.tokenizer.token_to_id("<|notimestamps|>") + + @property + def timestamp_begin(self) -> int: + return self.no_timestamps + 1 + + @property + def sot_sequence(self) -> List[int]: + sequence = [self.sot] + + if self.language is not None: + sequence.append(self.language) + + if self.task is not None: + sequence.append(self.task) + + return sequence + + def encode(self, text: str) -> List[int]: + return self.tokenizer.encode(text, add_special_tokens=False).ids + + def decode(self, tokens: List[int]) -> str: + try: + text_tokens = [token for token in tokens if token < self.eot] + if not text_tokens: + raise ValueError("No valid text tokens found") + if any(not isinstance(t, int) or t < 0 for t in text_tokens): + raise ValueError("Invalid token values detected") + return self.tokenizer.decode(text_tokens) + except Exception as e: + raise TokenizationError(f"Failed to decode tokens: {e}") from e + + def decode_with_timestamps(self, tokens: List[int]) -> str: + try: + if not tokens: + raise ValueError("Empty token sequence") + if any(not isinstance(t, int) or t < 0 for t in tokens): + raise ValueError("Invalid token values detected") + + outputs = [[]] + for token in tokens: + if token >= self.timestamp_begin: + timestamp = f"<|{(token - self.timestamp_begin) * 0.02:.2f}|>" + outputs.append(timestamp) + outputs.append([]) + else: + outputs[-1].append(token) + + decoded = [s if isinstance(s, str) else self.tokenizer.decode(s) for s in outputs] + if not any(decoded): + raise ValueError("Decoding produced no valid output") + + return "".join(decoded) + except Exception as e: + raise TokenizationError(f"Failed to decode tokens with timestamps: {e}") from e + + @cached_property + def non_speech_tokens(self) -> Tuple[int]: + """ + Returns the list of tokens to suppress in order to avoid any speaker tags or non-speech + annotations, to prevent sampling texts that are not actually spoken in the audio, e.g. + + - ♪♪♪ + - ( SPEAKING FOREIGN LANGUAGE ) + - [DAVID] Hey there, + + keeping basic punctuations like commas, periods, question marks, exclamation points, etc. + """ + symbols = list('"#()*+/:;<=>@[\\]^_`{|}~「」『』') + symbols += ( + "<< >> <<< >>> -- --- -( -[ (' (\" (( )) ((( ))) [[ ]] {{ }} ♪♪ ♪♪♪".split() + ) + + # symbols that may be a single token or multiple tokens depending on the tokenizer. + # In case they're multiple tokens, suppress the first token, which is safe because: + # These are between U+2640 and U+267F miscellaneous symbols that are okay to suppress + # in generations, and in the 3-byte UTF-8 representation they share the first two bytes. 
+ miscellaneous = set("♩♪♫♬♭♮♯") + assert all(0x2640 <= ord(c) <= 0x267F for c in miscellaneous) + + # allow hyphens "-" and single quotes "'" between words, but not at the beginning of a word + result = {self.encode(" -")[0], self.encode(" '")[0]} + for symbol in symbols + list(miscellaneous): + for tokens in [ + self.encode(symbol), + self.encode(" " + symbol), + ]: + if len(tokens) == 1 or symbol in miscellaneous: + result.add(tokens[0]) + + return tuple(sorted(result)) + + def split_to_word_tokens( + self, tokens: List[int] + ) -> Tuple[List[str], List[List[int]]]: + if self.language_code in {"zh", "ja", "th", "lo", "my", "yue"}: + # These languages don't typically use spaces, so it is difficult to split words + # without morpheme analysis. Here, we instead split words at any + # position where the tokens are decoded as valid unicode points + return self.split_tokens_on_unicode(tokens) + + return self.split_tokens_on_spaces(tokens) + + def split_tokens_on_unicode( + self, tokens: List[int] + ) -> Tuple[List[str], List[List[int]]]: + decoded_full = self.decode_with_timestamps(tokens) + replacement_char = "\ufffd" + + words = [] + word_tokens = [] + current_tokens = [] + unicode_offset = 0 + + for token in tokens: + current_tokens.append(token) + decoded = self.decode_with_timestamps(current_tokens) + + try: + replacement_char_index = decoded.index(replacement_char) + replacement_char_index += unicode_offset + except ValueError: + replacement_char_index = None + + if replacement_char_index is None or ( + replacement_char_index < len(decoded_full) + and decoded_full[replacement_char_index] == replacement_char + ): + words.append(decoded) + word_tokens.append(current_tokens) + current_tokens = [] + unicode_offset += len(decoded) + + return words, word_tokens + + def split_tokens_on_spaces( + self, tokens: List[int] + ) -> Tuple[List[str], List[List[int]]]: + subwords, subword_tokens_list = self.split_tokens_on_unicode(tokens) + words = [] + word_tokens = [] + + for subword, subword_tokens in zip(subwords, subword_tokens_list): + special = subword_tokens[0] >= self.eot + with_space = subword.startswith(" ") + punctuation = subword.strip() in string.punctuation + if special or with_space or punctuation or len(words) == 0: + words.append(subword) + word_tokens.append(subword_tokens) + else: + words[-1] = words[-1] + subword + word_tokens[-1].extend(subword_tokens) + + return words, word_tokens _TASKS = ( - "transcribe", - "translate", + "transcribe", + "translate", ) _LANGUAGE_CODES = ( - "af", - "am", - "ar", - "as", - "az", - "ba", - "be", - "bg", - "bn", - "bo", - "br", - "bs", - "ca", - "cs", - "cy", - "da", - "de", - "el", - "en", - "es", - "et", - "eu", - "fa", - "fi", - "fo", - "fr", - "gl", - "gu", - "ha", - "haw", - "he", - "hi", - "hr", - "ht", - "hu", - "hy", - "id", - "is", - "it", - "ja", - "jw", - "ka", - "kk", - "km", - "kn", - "ko", - "la", - "lb", - "ln", - "lo", - "lt", - "lv", - "mg", - "mi", - "mk", - "ml", - "mn", - "mr", - "ms", - "mt", - "my", - "ne", - "nl", - "nn", - "no", - "oc", - "pa", - "pl", - "ps", - "pt", - "ro", - "ru", - "sa", - "sd", - "si", - "sk", - "sl", - "sn", - "so", - "sq", - "sr", - "su", - "sv", - "sw", - "ta", - "te", - "tg", - "th", - "tk", - "tl", - "tr", - "tt", - "uk", - "ur", - "uz", - "vi", - "yi", - "yo", - "zh", - "yue", + "af", + "am", + "ar", + "as", + "az", + "ba", + "be", + "bg", + "bn", + "bo", + "br", + "bs", + "ca", + "cs", + "cy", + "da", + "de", + "el", + "en", + "es", + "et", + "eu", + "fa", + "fi", + "fo", + "fr", + "gl", + "gu", 
+ "ha", + "haw", + "he", + "hi", + "hr", + "ht", + "hu", + "hy", + "id", + "is", + "it", + "ja", + "jw", + "ka", + "kk", + "km", + "kn", + "ko", + "la", + "lb", + "ln", + "lo", + "lt", + "lv", + "mg", + "mi", + "mk", + "ml", + "mn", + "mr", + "ms", + "mt", + "my", + "ne", + "nl", + "nn", + "no", + "oc", + "pa", + "pl", + "ps", + "pt", + "ro", + "ru", + "sa", + "sd", + "si", + "sk", + "sl", + "sn", + "so", + "sq", + "sr", + "su", + "sv", + "sw", + "ta", + "te", + "tg", + "th", + "tk", + "tl", + "tr", + "tt", + "uk", + "ur", + "uz", + "vi", + "yi", + "yo", + "zh", + "yue", ) From 6aee94e761535fc6a10f39460b0cd2550b2aaa5f Mon Sep 17 00:00:00 2001 From: BBC-Esq Date: Sat, 26 Oct 2024 16:38:39 -0400 Subject: [PATCH 2/9] Update tokenizer.py --- faster_whisper/tokenizer.py | 630 ++++++++++++++++++------------------ 1 file changed, 315 insertions(+), 315 deletions(-) diff --git a/faster_whisper/tokenizer.py b/faster_whisper/tokenizer.py index 8bcf2c8d..3eae92a5 100644 --- a/faster_whisper/tokenizer.py +++ b/faster_whisper/tokenizer.py @@ -7,328 +7,328 @@ class TokenizationError(Exception): - pass + pass class Tokenizer: - """Simple wrapper around a tokenizers.Tokenizer.""" - - def __init__( - self, - tokenizer: tokenizers.Tokenizer, - multilingual: bool, - task: Optional[str] = None, - language: Optional[str] = None, - ): - self.tokenizer = tokenizer - - if multilingual: - if task not in _TASKS: - raise ValueError( - "'%s' is not a valid task (accepted tasks: %s)" - % (task, ", ".join(_TASKS)) - ) - - if language not in _LANGUAGE_CODES: - raise ValueError( - "'%s' is not a valid language code (accepted language codes: %s)" - % (language, ", ".join(_LANGUAGE_CODES)) - ) - - self.task = self.tokenizer.token_to_id("<|%s|>" % task) - self.language = self.tokenizer.token_to_id("<|%s|>" % language) - self.language_code = language - else: - self.task = None - self.language = None - self.language_code = "en" - - @cached_property - def transcribe(self) -> int: - return self.tokenizer.token_to_id("<|transcribe|>") - - @cached_property - def translate(self) -> int: - return self.tokenizer.token_to_id("<|translate|>") - - @cached_property - def sot(self) -> int: - return self.tokenizer.token_to_id("<|startoftranscript|>") - - @cached_property - def sot_lm(self) -> int: - return self.tokenizer.token_to_id("<|startoflm|>") - - @cached_property - def sot_prev(self) -> int: - return self.tokenizer.token_to_id("<|startofprev|>") - - @cached_property - def eot(self) -> int: - return self.tokenizer.token_to_id("<|endoftext|>") - - @cached_property - def no_timestamps(self) -> int: - return self.tokenizer.token_to_id("<|notimestamps|>") - - @property - def timestamp_begin(self) -> int: - return self.no_timestamps + 1 - - @property - def sot_sequence(self) -> List[int]: - sequence = [self.sot] - - if self.language is not None: - sequence.append(self.language) - - if self.task is not None: - sequence.append(self.task) - - return sequence - - def encode(self, text: str) -> List[int]: - return self.tokenizer.encode(text, add_special_tokens=False).ids - - def decode(self, tokens: List[int]) -> str: - try: - text_tokens = [token for token in tokens if token < self.eot] - if not text_tokens: - raise ValueError("No valid text tokens found") - if any(not isinstance(t, int) or t < 0 for t in text_tokens): - raise ValueError("Invalid token values detected") - return self.tokenizer.decode(text_tokens) - except Exception as e: - raise TokenizationError(f"Failed to decode tokens: {e}") from e - - def decode_with_timestamps(self, 
tokens: List[int]) -> str: - try: - if not tokens: - raise ValueError("Empty token sequence") - if any(not isinstance(t, int) or t < 0 for t in tokens): - raise ValueError("Invalid token values detected") - - outputs = [[]] - for token in tokens: - if token >= self.timestamp_begin: - timestamp = f"<|{(token - self.timestamp_begin) * 0.02:.2f}|>" - outputs.append(timestamp) - outputs.append([]) - else: - outputs[-1].append(token) - - decoded = [s if isinstance(s, str) else self.tokenizer.decode(s) for s in outputs] - if not any(decoded): - raise ValueError("Decoding produced no valid output") - - return "".join(decoded) - except Exception as e: - raise TokenizationError(f"Failed to decode tokens with timestamps: {e}") from e - - @cached_property - def non_speech_tokens(self) -> Tuple[int]: - """ - Returns the list of tokens to suppress in order to avoid any speaker tags or non-speech - annotations, to prevent sampling texts that are not actually spoken in the audio, e.g. - - - ♪♪♪ - - ( SPEAKING FOREIGN LANGUAGE ) - - [DAVID] Hey there, - - keeping basic punctuations like commas, periods, question marks, exclamation points, etc. - """ - symbols = list('"#()*+/:;<=>@[\\]^_`{|}~「」『』') - symbols += ( - "<< >> <<< >>> -- --- -( -[ (' (\" (( )) ((( ))) [[ ]] {{ }} ♪♪ ♪♪♪".split() - ) - - # symbols that may be a single token or multiple tokens depending on the tokenizer. - # In case they're multiple tokens, suppress the first token, which is safe because: - # These are between U+2640 and U+267F miscellaneous symbols that are okay to suppress - # in generations, and in the 3-byte UTF-8 representation they share the first two bytes. - miscellaneous = set("♩♪♫♬♭♮♯") - assert all(0x2640 <= ord(c) <= 0x267F for c in miscellaneous) - - # allow hyphens "-" and single quotes "'" between words, but not at the beginning of a word - result = {self.encode(" -")[0], self.encode(" '")[0]} - for symbol in symbols + list(miscellaneous): - for tokens in [ - self.encode(symbol), - self.encode(" " + symbol), - ]: - if len(tokens) == 1 or symbol in miscellaneous: - result.add(tokens[0]) - - return tuple(sorted(result)) - - def split_to_word_tokens( - self, tokens: List[int] - ) -> Tuple[List[str], List[List[int]]]: - if self.language_code in {"zh", "ja", "th", "lo", "my", "yue"}: - # These languages don't typically use spaces, so it is difficult to split words - # without morpheme analysis. 
Here, we instead split words at any - # position where the tokens are decoded as valid unicode points - return self.split_tokens_on_unicode(tokens) - - return self.split_tokens_on_spaces(tokens) - - def split_tokens_on_unicode( - self, tokens: List[int] - ) -> Tuple[List[str], List[List[int]]]: - decoded_full = self.decode_with_timestamps(tokens) - replacement_char = "\ufffd" - - words = [] - word_tokens = [] - current_tokens = [] - unicode_offset = 0 - - for token in tokens: - current_tokens.append(token) - decoded = self.decode_with_timestamps(current_tokens) - - try: - replacement_char_index = decoded.index(replacement_char) - replacement_char_index += unicode_offset - except ValueError: - replacement_char_index = None - - if replacement_char_index is None or ( - replacement_char_index < len(decoded_full) - and decoded_full[replacement_char_index] == replacement_char - ): - words.append(decoded) - word_tokens.append(current_tokens) - current_tokens = [] - unicode_offset += len(decoded) - - return words, word_tokens - - def split_tokens_on_spaces( - self, tokens: List[int] - ) -> Tuple[List[str], List[List[int]]]: - subwords, subword_tokens_list = self.split_tokens_on_unicode(tokens) - words = [] - word_tokens = [] - - for subword, subword_tokens in zip(subwords, subword_tokens_list): - special = subword_tokens[0] >= self.eot - with_space = subword.startswith(" ") - punctuation = subword.strip() in string.punctuation - if special or with_space or punctuation or len(words) == 0: - words.append(subword) - word_tokens.append(subword_tokens) - else: - words[-1] = words[-1] + subword - word_tokens[-1].extend(subword_tokens) - - return words, word_tokens + """Simple wrapper around a tokenizers.Tokenizer.""" + + def __init__( + self, + tokenizer: tokenizers.Tokenizer, + multilingual: bool, + task: Optional[str] = None, + language: Optional[str] = None, + ): + self.tokenizer = tokenizer + + if multilingual: + if task not in _TASKS: + raise ValueError( + "'%s' is not a valid task (accepted tasks: %s)" + % (task, ", ".join(_TASKS)) + ) + + if language not in _LANGUAGE_CODES: + raise ValueError( + "'%s' is not a valid language code (accepted language codes: %s)" + % (language, ", ".join(_LANGUAGE_CODES)) + ) + + self.task = self.tokenizer.token_to_id("<|%s|>" % task) + self.language = self.tokenizer.token_to_id("<|%s|>" % language) + self.language_code = language + else: + self.task = None + self.language = None + self.language_code = "en" + + @cached_property + def transcribe(self) -> int: + return self.tokenizer.token_to_id("<|transcribe|>") + + @cached_property + def translate(self) -> int: + return self.tokenizer.token_to_id("<|translate|>") + + @cached_property + def sot(self) -> int: + return self.tokenizer.token_to_id("<|startoftranscript|>") + + @cached_property + def sot_lm(self) -> int: + return self.tokenizer.token_to_id("<|startoflm|>") + + @cached_property + def sot_prev(self) -> int: + return self.tokenizer.token_to_id("<|startofprev|>") + + @cached_property + def eot(self) -> int: + return self.tokenizer.token_to_id("<|endoftext|>") + + @cached_property + def no_timestamps(self) -> int: + return self.tokenizer.token_to_id("<|notimestamps|>") + + @property + def timestamp_begin(self) -> int: + return self.no_timestamps + 1 + + @property + def sot_sequence(self) -> List[int]: + sequence = [self.sot] + + if self.language is not None: + sequence.append(self.language) + + if self.task is not None: + sequence.append(self.task) + + return sequence + + def encode(self, text: str) -> 
List[int]: + return self.tokenizer.encode(text, add_special_tokens=False).ids + + def decode(self, tokens: List[int]) -> str: + try: + text_tokens = [token for token in tokens if token < self.eot] + if not text_tokens: + raise ValueError("No valid text tokens found") + if any(not isinstance(t, int) or t < 0 for t in text_tokens): + raise ValueError("Invalid token values detected") + return self.tokenizer.decode(text_tokens) + except Exception as e: + raise TokenizationError(f"Failed to decode tokens: {e}") from e + + def decode_with_timestamps(self, tokens: List[int]) -> str: + try: + if not tokens: + raise ValueError("Empty token sequence") + if any(not isinstance(t, int) or t < 0 for t in tokens): + raise ValueError("Invalid token values detected") + + outputs = [[]] + for token in tokens: + if token >= self.timestamp_begin: + timestamp = f"<|{(token - self.timestamp_begin) * 0.02:.2f}|>" + outputs.append(timestamp) + outputs.append([]) + else: + outputs[-1].append(token) + + decoded = [s if isinstance(s, str) else self.tokenizer.decode(s) for s in outputs] + if not any(decoded): + raise ValueError("Decoding produced no valid output") + + return "".join(decoded) + except Exception as e: + raise TokenizationError(f"Failed to decode tokens with timestamps: {e}") from e + + @cached_property + def non_speech_tokens(self) -> Tuple[int]: + """ + Returns the list of tokens to suppress in order to avoid any speaker tags or non-speech + annotations, to prevent sampling texts that are not actually spoken in the audio, e.g. + + - ♪♪♪ + - ( SPEAKING FOREIGN LANGUAGE ) + - [DAVID] Hey there, + + keeping basic punctuations like commas, periods, question marks, exclamation points, etc. + """ + symbols = list('"#()*+/:;<=>@[\\]^_`{|}~「」『』') + symbols += ( + "<< >> <<< >>> -- --- -( -[ (' (\" (( )) ((( ))) [[ ]] {{ }} ♪♪ ♪♪♪".split() + ) + + # symbols that may be a single token or multiple tokens depending on the tokenizer. + # In case they're multiple tokens, suppress the first token, which is safe because: + # These are between U+2640 and U+267F miscellaneous symbols that are okay to suppress + # in generations, and in the 3-byte UTF-8 representation they share the first two bytes. + miscellaneous = set("♩♪♫♬♭♮♯") + assert all(0x2640 <= ord(c) <= 0x267F for c in miscellaneous) + + # allow hyphens "-" and single quotes "'" between words, but not at the beginning of a word + result = {self.encode(" -")[0], self.encode(" '")[0]} + for symbol in symbols + list(miscellaneous): + for tokens in [ + self.encode(symbol), + self.encode(" " + symbol), + ]: + if len(tokens) == 1 or symbol in miscellaneous: + result.add(tokens[0]) + + return tuple(sorted(result)) + + def split_to_word_tokens( + self, tokens: List[int] + ) -> Tuple[List[str], List[List[int]]]: + if self.language_code in {"zh", "ja", "th", "lo", "my", "yue"}: + # These languages don't typically use spaces, so it is difficult to split words + # without morpheme analysis. 
Here, we instead split words at any + # position where the tokens are decoded as valid unicode points + return self.split_tokens_on_unicode(tokens) + + return self.split_tokens_on_spaces(tokens) + + def split_tokens_on_unicode( + self, tokens: List[int] + ) -> Tuple[List[str], List[List[int]]]: + decoded_full = self.decode_with_timestamps(tokens) + replacement_char = "\ufffd" + + words = [] + word_tokens = [] + current_tokens = [] + unicode_offset = 0 + + for token in tokens: + current_tokens.append(token) + decoded = self.decode_with_timestamps(current_tokens) + + try: + replacement_char_index = decoded.index(replacement_char) + replacement_char_index += unicode_offset + except ValueError: + replacement_char_index = None + + if replacement_char_index is None or ( + replacement_char_index < len(decoded_full) + and decoded_full[replacement_char_index] == replacement_char + ): + words.append(decoded) + word_tokens.append(current_tokens) + current_tokens = [] + unicode_offset += len(decoded) + + return words, word_tokens + + def split_tokens_on_spaces( + self, tokens: List[int] + ) -> Tuple[List[str], List[List[int]]]: + subwords, subword_tokens_list = self.split_tokens_on_unicode(tokens) + words = [] + word_tokens = [] + + for subword, subword_tokens in zip(subwords, subword_tokens_list): + special = subword_tokens[0] >= self.eot + with_space = subword.startswith(" ") + punctuation = subword.strip() in string.punctuation + if special or with_space or punctuation or len(words) == 0: + words.append(subword) + word_tokens.append(subword_tokens) + else: + words[-1] = words[-1] + subword + word_tokens[-1].extend(subword_tokens) + + return words, word_tokens _TASKS = ( - "transcribe", - "translate", + "transcribe", + "translate", ) _LANGUAGE_CODES = ( - "af", - "am", - "ar", - "as", - "az", - "ba", - "be", - "bg", - "bn", - "bo", - "br", - "bs", - "ca", - "cs", - "cy", - "da", - "de", - "el", - "en", - "es", - "et", - "eu", - "fa", - "fi", - "fo", - "fr", - "gl", - "gu", - "ha", - "haw", - "he", - "hi", - "hr", - "ht", - "hu", - "hy", - "id", - "is", - "it", - "ja", - "jw", - "ka", - "kk", - "km", - "kn", - "ko", - "la", - "lb", - "ln", - "lo", - "lt", - "lv", - "mg", - "mi", - "mk", - "ml", - "mn", - "mr", - "ms", - "mt", - "my", - "ne", - "nl", - "nn", - "no", - "oc", - "pa", - "pl", - "ps", - "pt", - "ro", - "ru", - "sa", - "sd", - "si", - "sk", - "sl", - "sn", - "so", - "sq", - "sr", - "su", - "sv", - "sw", - "ta", - "te", - "tg", - "th", - "tk", - "tl", - "tr", - "tt", - "uk", - "ur", - "uz", - "vi", - "yi", - "yo", - "zh", - "yue", + "af", + "am", + "ar", + "as", + "az", + "ba", + "be", + "bg", + "bn", + "bo", + "br", + "bs", + "ca", + "cs", + "cy", + "da", + "de", + "el", + "en", + "es", + "et", + "eu", + "fa", + "fi", + "fo", + "fr", + "gl", + "gu", + "ha", + "haw", + "he", + "hi", + "hr", + "ht", + "hu", + "hy", + "id", + "is", + "it", + "ja", + "jw", + "ka", + "kk", + "km", + "kn", + "ko", + "la", + "lb", + "ln", + "lo", + "lt", + "lv", + "mg", + "mi", + "mk", + "ml", + "mn", + "mr", + "ms", + "mt", + "my", + "ne", + "nl", + "nn", + "no", + "oc", + "pa", + "pl", + "ps", + "pt", + "ro", + "ru", + "sa", + "sd", + "si", + "sk", + "sl", + "sn", + "so", + "sq", + "sr", + "su", + "sv", + "sw", + "ta", + "te", + "tg", + "th", + "tk", + "tl", + "tr", + "tt", + "uk", + "ur", + "uz", + "vi", + "yi", + "yo", + "zh", + "yue", ) From 6ae3dee491d7662c1803776a5080dda812102aea Mon Sep 17 00:00:00 2001 From: BBC-Esq Date: Sat, 26 Oct 2024 16:42:56 -0400 Subject: [PATCH 3/9] Update tokenizer.py --- 
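Aside (illustration, not part of the patch series): patches 1 and 2 route decode() and decode_with_timestamps() failures through the new TokenizationError. A minimal usage sketch of that contract follows, assuming a Whisper-style tokenizer.json that actually defines the <|startoftranscript|>, task and language special tokens; the file path and the bad token ID below are placeholders, not values taken from the patches.

import tokenizers

from faster_whisper.tokenizer import TokenizationError, Tokenizer

# Hypothetical tokenizer file; any tokenizers.Tokenizer that carries the Whisper
# special tokens (<|transcribe|>, <|en|>, <|endoftext|>, ...) would work the same way.
hf_tokenizer = tokenizers.Tokenizer.from_file("tokenizer.json")
tok = Tokenizer(hf_tokenizer, multilingual=True, task="transcribe", language="en")

ids = tok.encode("hello world")
print(tok.decode(ids))  # ordinary text still round-trips as before

try:
    # With these patches, malformed input (here a negative ID) surfaces as
    # TokenizationError instead of propagating a low-level exception.
    tok.decode([-1])
except TokenizationError as exc:
    print(f"decode failed: {exc}")

Wrapping the failure in a dedicated exception type presumably lets the rest of faster-whisper catch tokenizer problems separately from unrelated errors.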
faster_whisper/tokenizer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/faster_whisper/tokenizer.py b/faster_whisper/tokenizer.py index 3eae92a5..8583d02c 100644 --- a/faster_whisper/tokenizer.py +++ b/faster_whisper/tokenizer.py @@ -107,7 +107,7 @@ def decode_with_timestamps(self, tokens: List[int]) -> str: raise ValueError("Empty token sequence") if any(not isinstance(t, int) or t < 0 for t in tokens): raise ValueError("Invalid token values detected") - + outputs = [[]] for token in tokens: if token >= self.timestamp_begin: @@ -120,7 +120,7 @@ def decode_with_timestamps(self, tokens: List[int]) -> str: decoded = [s if isinstance(s, str) else self.tokenizer.decode(s) for s in outputs] if not any(decoded): raise ValueError("Decoding produced no valid output") - + return "".join(decoded) except Exception as e: raise TokenizationError(f"Failed to decode tokens with timestamps: {e}") from e From b259749790bccbb6ea7b98739de683c6e7ee0e68 Mon Sep 17 00:00:00 2001 From: BBC-Esq Date: Sat, 26 Oct 2024 16:48:47 -0400 Subject: [PATCH 4/9] Update tokenizer.py --- faster_whisper/tokenizer.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/faster_whisper/tokenizer.py b/faster_whisper/tokenizer.py index 8583d02c..925f3086 100644 --- a/faster_whisper/tokenizer.py +++ b/faster_whisper/tokenizer.py @@ -1,5 +1,4 @@ import string - from functools import cached_property from typing import List, Optional, Tuple @@ -117,13 +116,17 @@ def decode_with_timestamps(self, tokens: List[int]) -> str: else: outputs[-1].append(token) - decoded = [s if isinstance(s, str) else self.tokenizer.decode(s) for s in outputs] + decoded = [ + s if isinstance(s, str) else self.tokenizer.decode(s) for s in outputs + ] if not any(decoded): raise ValueError("Decoding produced no valid output") return "".join(decoded) except Exception as e: - raise TokenizationError(f"Failed to decode tokens with timestamps: {e}") from e + raise TokenizationError( + f"Failed to decode tokens with timestamps: {e}" + ) from e @cached_property def non_speech_tokens(self) -> Tuple[int]: @@ -225,10 +228,7 @@ def split_tokens_on_spaces( return words, word_tokens -_TASKS = ( - "transcribe", - "translate", -) +_TASKS = ("transcribe", "translate") _LANGUAGE_CODES = ( "af", From f6ec789d1c9bfc373b19c1341d820ac4ea0acc18 Mon Sep 17 00:00:00 2001 From: BBC-Esq Date: Sat, 26 Oct 2024 16:55:10 -0400 Subject: [PATCH 5/9] Update tokenizer.py --- faster_whisper/tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faster_whisper/tokenizer.py b/faster_whisper/tokenizer.py index 925f3086..659ffb25 100644 --- a/faster_whisper/tokenizer.py +++ b/faster_whisper/tokenizer.py @@ -1,6 +1,6 @@ -import string from functools import cached_property from typing import List, Optional, Tuple +import string import tokenizers From dce7d132b841f269103a1329ffb775f6bc469568 Mon Sep 17 00:00:00 2001 From: BBC-Esq Date: Sat, 26 Oct 2024 17:02:10 -0400 Subject: [PATCH 6/9] Update tokenizer.py --- faster_whisper/tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faster_whisper/tokenizer.py b/faster_whisper/tokenizer.py index 659ffb25..925f3086 100644 --- a/faster_whisper/tokenizer.py +++ b/faster_whisper/tokenizer.py @@ -1,6 +1,6 @@ +import string from functools import cached_property from typing import List, Optional, Tuple -import string import tokenizers From 3753fded004a6d8489b5ae3bf6f92c96f0c3c8d9 Mon Sep 17 00:00:00 2001 From: BBC-Esq Date: Sat, 26 Oct 2024 17:05:27 -0400 
Subject: [PATCH 7/9] Update tokenizer.py --- faster_whisper/tokenizer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/faster_whisper/tokenizer.py b/faster_whisper/tokenizer.py index 925f3086..6e6c6434 100644 --- a/faster_whisper/tokenizer.py +++ b/faster_whisper/tokenizer.py @@ -1,4 +1,5 @@ import string + from functools import cached_property from typing import List, Optional, Tuple From 9e5944f8d72e8d74c4fb99d4fceb174ef9fcec28 Mon Sep 17 00:00:00 2001 From: BBC-Esq Date: Sat, 26 Oct 2024 17:16:04 -0400 Subject: [PATCH 8/9] Update tokenizer.py --- faster_whisper/tokenizer.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/faster_whisper/tokenizer.py b/faster_whisper/tokenizer.py index 6e6c6434..d0c89421 100644 --- a/faster_whisper/tokenizer.py +++ b/faster_whisper/tokenizer.py @@ -94,20 +94,20 @@ def decode(self, tokens: List[int]) -> str: try: text_tokens = [token for token in tokens if token < self.eot] if not text_tokens: - raise ValueError("No valid text tokens found") + return "" if any(not isinstance(t, int) or t < 0 for t in text_tokens): raise ValueError("Invalid token values detected") return self.tokenizer.decode(text_tokens) except Exception as e: raise TokenizationError(f"Failed to decode tokens: {e}") from e - + def decode_with_timestamps(self, tokens: List[int]) -> str: try: if not tokens: raise ValueError("Empty token sequence") if any(not isinstance(t, int) or t < 0 for t in tokens): raise ValueError("Invalid token values detected") - + outputs = [[]] for token in tokens: if token >= self.timestamp_begin: @@ -116,13 +116,14 @@ def decode_with_timestamps(self, tokens: List[int]) -> str: outputs.append([]) else: outputs[-1].append(token) - + decoded = [ s if isinstance(s, str) else self.tokenizer.decode(s) for s in outputs ] - if not any(decoded): - raise ValueError("Decoding produced no valid output") - + + if not any(decoded) and not any(isinstance(s, str) for s in outputs): + return "" + return "".join(decoded) except Exception as e: raise TokenizationError( From 53bd629af1e30d9f12c13c77606a4ccfd6a319a0 Mon Sep 17 00:00:00 2001 From: BBC-Esq Date: Sat, 26 Oct 2024 17:41:08 -0400 Subject: [PATCH 9/9] Update tokenizer.py --- faster_whisper/tokenizer.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/faster_whisper/tokenizer.py b/faster_whisper/tokenizer.py index d0c89421..cc208829 100644 --- a/faster_whisper/tokenizer.py +++ b/faster_whisper/tokenizer.py @@ -100,14 +100,14 @@ def decode(self, tokens: List[int]) -> str: return self.tokenizer.decode(text_tokens) except Exception as e: raise TokenizationError(f"Failed to decode tokens: {e}") from e - + def decode_with_timestamps(self, tokens: List[int]) -> str: try: if not tokens: raise ValueError("Empty token sequence") if any(not isinstance(t, int) or t < 0 for t in tokens): raise ValueError("Invalid token values detected") - + outputs = [[]] for token in tokens: if token >= self.timestamp_begin: @@ -116,14 +116,14 @@ def decode_with_timestamps(self, tokens: List[int]) -> str: outputs.append([]) else: outputs[-1].append(token) - + decoded = [ s if isinstance(s, str) else self.tokenizer.decode(s) for s in outputs ] - + if not any(decoded) and not any(isinstance(s, str) for s in outputs): return "" - + return "".join(decoded) except Exception as e: raise TokenizationError(
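Aside (illustration, not part of the patch series): the later patches adjust failure modes rather than the tokenization itself. PATCH 8 makes decode() return an empty string when filtering leaves no text tokens, instead of raising, while decode_with_timestamps() keeps Whisper's 0.02 s step per timestamp token. A small sketch of that arithmetic, assuming an already-constructed Tokenizer instance named tok (not defined in the patches):

from faster_whisper.tokenizer import Tokenizer

def timestamp_token_to_seconds(tok: Tokenizer, token: int) -> float:
    # Inverse view of the f"<|{(token - timestamp_begin) * 0.02:.2f}|>" formatting
    # in decode_with_timestamps(): timestamp tokens are spaced 20 ms apart.
    if token < tok.timestamp_begin:
        raise ValueError("not a timestamp token")
    return (token - tok.timestamp_begin) * 0.02

# After PATCH 8, an input that filters down to no text tokens decodes to "" rather
# than raising, e.g.:
# tok.decode([tok.eot]) == ""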