From 7677d8e66be1f347c629ecd02247bc84a7b5eaea Mon Sep 17 00:00:00 2001 From: BBC-Esq Date: Fri, 25 Oct 2024 10:04:33 -0400 Subject: [PATCH 1/9] Update tokenizer.py --- faster_whisper/tokenizer.py | 616 +++++++++++++++++++----------------- 1 file changed, 318 insertions(+), 298 deletions(-) diff --git a/faster_whisper/tokenizer.py b/faster_whisper/tokenizer.py index 3bf76a5f..8bcf2c8d 100644 --- a/faster_whisper/tokenizer.py +++ b/faster_whisper/tokenizer.py @@ -6,309 +6,329 @@ import tokenizers +class TokenizationError(Exception): + pass + + class Tokenizer: - """Simple wrapper around a tokenizers.Tokenizer.""" - - def __init__( - self, - tokenizer: tokenizers.Tokenizer, - multilingual: bool, - task: Optional[str] = None, - language: Optional[str] = None, - ): - self.tokenizer = tokenizer - - if multilingual: - if task not in _TASKS: - raise ValueError( - "'%s' is not a valid task (accepted tasks: %s)" - % (task, ", ".join(_TASKS)) - ) - - if language not in _LANGUAGE_CODES: - raise ValueError( - "'%s' is not a valid language code (accepted language codes: %s)" - % (language, ", ".join(_LANGUAGE_CODES)) - ) - - self.task = self.tokenizer.token_to_id("<|%s|>" % task) - self.language = self.tokenizer.token_to_id("<|%s|>" % language) - self.language_code = language - else: - self.task = None - self.language = None - self.language_code = "en" - - @cached_property - def transcribe(self) -> int: - return self.tokenizer.token_to_id("<|transcribe|>") - - @cached_property - def translate(self) -> int: - return self.tokenizer.token_to_id("<|translate|>") - - @cached_property - def sot(self) -> int: - return self.tokenizer.token_to_id("<|startoftranscript|>") - - @cached_property - def sot_lm(self) -> int: - return self.tokenizer.token_to_id("<|startoflm|>") - - @cached_property - def sot_prev(self) -> int: - return self.tokenizer.token_to_id("<|startofprev|>") - - @cached_property - def eot(self) -> int: - return self.tokenizer.token_to_id("<|endoftext|>") - - @cached_property - def no_timestamps(self) -> int: - return self.tokenizer.token_to_id("<|notimestamps|>") - - @property - def timestamp_begin(self) -> int: - return self.no_timestamps + 1 - - @property - def sot_sequence(self) -> List[int]: - sequence = [self.sot] - - if self.language is not None: - sequence.append(self.language) - - if self.task is not None: - sequence.append(self.task) - - return sequence - - def encode(self, text: str) -> List[int]: - return self.tokenizer.encode(text, add_special_tokens=False).ids - - def decode(self, tokens: List[int]) -> str: - text_tokens = [token for token in tokens if token < self.eot] - return self.tokenizer.decode(text_tokens) - - def decode_with_timestamps(self, tokens: List[int]) -> str: - outputs = [[]] - - for token in tokens: - if token >= self.timestamp_begin: - timestamp = f"<|{(token - self.timestamp_begin) * 0.02:.2f}|>" - outputs.append(timestamp) - outputs.append([]) - else: - outputs[-1].append(token) - - return "".join( - [s if isinstance(s, str) else self.tokenizer.decode(s) for s in outputs] - ) - - @cached_property - def non_speech_tokens(self) -> Tuple[int]: - """ - Returns the list of tokens to suppress in order to avoid any speaker tags or non-speech - annotations, to prevent sampling texts that are not actually spoken in the audio, e.g. - - - ♪♪♪ - - ( SPEAKING FOREIGN LANGUAGE ) - - [DAVID] Hey there, - - keeping basic punctuations like commas, periods, question marks, exclamation points, etc. 
- """ - symbols = list('"#()*+/:;<=>@[\\]^_`{|}~「」『』') - symbols += ( - "<< >> <<< >>> -- --- -( -[ (' (\" (( )) ((( ))) [[ ]] {{ }} ♪♪ ♪♪♪".split() - ) - - # symbols that may be a single token or multiple tokens depending on the tokenizer. - # In case they're multiple tokens, suppress the first token, which is safe because: - # These are between U+2640 and U+267F miscellaneous symbols that are okay to suppress - # in generations, and in the 3-byte UTF-8 representation they share the first two bytes. - miscellaneous = set("♩♪♫♬♭♮♯") - assert all(0x2640 <= ord(c) <= 0x267F for c in miscellaneous) - - # allow hyphens "-" and single quotes "'" between words, but not at the beginning of a word - result = {self.encode(" -")[0], self.encode(" '")[0]} - for symbol in symbols + list(miscellaneous): - for tokens in [ - self.encode(symbol), - self.encode(" " + symbol), - ]: - if len(tokens) == 1 or symbol in miscellaneous: - result.add(tokens[0]) - - return tuple(sorted(result)) - - def split_to_word_tokens( - self, tokens: List[int] - ) -> Tuple[List[str], List[List[int]]]: - if self.language_code in {"zh", "ja", "th", "lo", "my", "yue"}: - # These languages don't typically use spaces, so it is difficult to split words - # without morpheme analysis. Here, we instead split words at any - # position where the tokens are decoded as valid unicode points - return self.split_tokens_on_unicode(tokens) - - return self.split_tokens_on_spaces(tokens) - - def split_tokens_on_unicode( - self, tokens: List[int] - ) -> Tuple[List[str], List[List[int]]]: - decoded_full = self.decode_with_timestamps(tokens) - replacement_char = "\ufffd" - - words = [] - word_tokens = [] - current_tokens = [] - unicode_offset = 0 - - for token in tokens: - current_tokens.append(token) - decoded = self.decode_with_timestamps(current_tokens) - - try: - replacement_char_index = decoded.index(replacement_char) - replacement_char_index += unicode_offset - except ValueError: - replacement_char_index = None - - if replacement_char_index is None or ( - replacement_char_index < len(decoded_full) - and decoded_full[replacement_char_index] == replacement_char - ): - words.append(decoded) - word_tokens.append(current_tokens) - current_tokens = [] - unicode_offset += len(decoded) - - return words, word_tokens - - def split_tokens_on_spaces( - self, tokens: List[int] - ) -> Tuple[List[str], List[List[int]]]: - subwords, subword_tokens_list = self.split_tokens_on_unicode(tokens) - words = [] - word_tokens = [] - - for subword, subword_tokens in zip(subwords, subword_tokens_list): - special = subword_tokens[0] >= self.eot - with_space = subword.startswith(" ") - punctuation = subword.strip() in string.punctuation - if special or with_space or punctuation or len(words) == 0: - words.append(subword) - word_tokens.append(subword_tokens) - else: - words[-1] = words[-1] + subword - word_tokens[-1].extend(subword_tokens) - - return words, word_tokens + """Simple wrapper around a tokenizers.Tokenizer.""" + + def __init__( + self, + tokenizer: tokenizers.Tokenizer, + multilingual: bool, + task: Optional[str] = None, + language: Optional[str] = None, + ): + self.tokenizer = tokenizer + + if multilingual: + if task not in _TASKS: + raise ValueError( + "'%s' is not a valid task (accepted tasks: %s)" + % (task, ", ".join(_TASKS)) + ) + + if language not in _LANGUAGE_CODES: + raise ValueError( + "'%s' is not a valid language code (accepted language codes: %s)" + % (language, ", ".join(_LANGUAGE_CODES)) + ) + + self.task = self.tokenizer.token_to_id("<|%s|>" % 
task) + self.language = self.tokenizer.token_to_id("<|%s|>" % language) + self.language_code = language + else: + self.task = None + self.language = None + self.language_code = "en" + + @cached_property + def transcribe(self) -> int: + return self.tokenizer.token_to_id("<|transcribe|>") + + @cached_property + def translate(self) -> int: + return self.tokenizer.token_to_id("<|translate|>") + + @cached_property + def sot(self) -> int: + return self.tokenizer.token_to_id("<|startoftranscript|>") + + @cached_property + def sot_lm(self) -> int: + return self.tokenizer.token_to_id("<|startoflm|>") + + @cached_property + def sot_prev(self) -> int: + return self.tokenizer.token_to_id("<|startofprev|>") + + @cached_property + def eot(self) -> int: + return self.tokenizer.token_to_id("<|endoftext|>") + + @cached_property + def no_timestamps(self) -> int: + return self.tokenizer.token_to_id("<|notimestamps|>") + + @property + def timestamp_begin(self) -> int: + return self.no_timestamps + 1 + + @property + def sot_sequence(self) -> List[int]: + sequence = [self.sot] + + if self.language is not None: + sequence.append(self.language) + + if self.task is not None: + sequence.append(self.task) + + return sequence + + def encode(self, text: str) -> List[int]: + return self.tokenizer.encode(text, add_special_tokens=False).ids + + def decode(self, tokens: List[int]) -> str: + try: + text_tokens = [token for token in tokens if token < self.eot] + if not text_tokens: + raise ValueError("No valid text tokens found") + if any(not isinstance(t, int) or t < 0 for t in text_tokens): + raise ValueError("Invalid token values detected") + return self.tokenizer.decode(text_tokens) + except Exception as e: + raise TokenizationError(f"Failed to decode tokens: {e}") from e + + def decode_with_timestamps(self, tokens: List[int]) -> str: + try: + if not tokens: + raise ValueError("Empty token sequence") + if any(not isinstance(t, int) or t < 0 for t in tokens): + raise ValueError("Invalid token values detected") + + outputs = [[]] + for token in tokens: + if token >= self.timestamp_begin: + timestamp = f"<|{(token - self.timestamp_begin) * 0.02:.2f}|>" + outputs.append(timestamp) + outputs.append([]) + else: + outputs[-1].append(token) + + decoded = [s if isinstance(s, str) else self.tokenizer.decode(s) for s in outputs] + if not any(decoded): + raise ValueError("Decoding produced no valid output") + + return "".join(decoded) + except Exception as e: + raise TokenizationError(f"Failed to decode tokens with timestamps: {e}") from e + + @cached_property + def non_speech_tokens(self) -> Tuple[int]: + """ + Returns the list of tokens to suppress in order to avoid any speaker tags or non-speech + annotations, to prevent sampling texts that are not actually spoken in the audio, e.g. + + - ♪♪♪ + - ( SPEAKING FOREIGN LANGUAGE ) + - [DAVID] Hey there, + + keeping basic punctuations like commas, periods, question marks, exclamation points, etc. + """ + symbols = list('"#()*+/:;<=>@[\\]^_`{|}~「」『』') + symbols += ( + "<< >> <<< >>> -- --- -( -[ (' (\" (( )) ((( ))) [[ ]] {{ }} ♪♪ ♪♪♪".split() + ) + + # symbols that may be a single token or multiple tokens depending on the tokenizer. + # In case they're multiple tokens, suppress the first token, which is safe because: + # These are between U+2640 and U+267F miscellaneous symbols that are okay to suppress + # in generations, and in the 3-byte UTF-8 representation they share the first two bytes. 
+ miscellaneous = set("♩♪♫♬♭♮♯") + assert all(0x2640 <= ord(c) <= 0x267F for c in miscellaneous) + + # allow hyphens "-" and single quotes "'" between words, but not at the beginning of a word + result = {self.encode(" -")[0], self.encode(" '")[0]} + for symbol in symbols + list(miscellaneous): + for tokens in [ + self.encode(symbol), + self.encode(" " + symbol), + ]: + if len(tokens) == 1 or symbol in miscellaneous: + result.add(tokens[0]) + + return tuple(sorted(result)) + + def split_to_word_tokens( + self, tokens: List[int] + ) -> Tuple[List[str], List[List[int]]]: + if self.language_code in {"zh", "ja", "th", "lo", "my", "yue"}: + # These languages don't typically use spaces, so it is difficult to split words + # without morpheme analysis. Here, we instead split words at any + # position where the tokens are decoded as valid unicode points + return self.split_tokens_on_unicode(tokens) + + return self.split_tokens_on_spaces(tokens) + + def split_tokens_on_unicode( + self, tokens: List[int] + ) -> Tuple[List[str], List[List[int]]]: + decoded_full = self.decode_with_timestamps(tokens) + replacement_char = "\ufffd" + + words = [] + word_tokens = [] + current_tokens = [] + unicode_offset = 0 + + for token in tokens: + current_tokens.append(token) + decoded = self.decode_with_timestamps(current_tokens) + + try: + replacement_char_index = decoded.index(replacement_char) + replacement_char_index += unicode_offset + except ValueError: + replacement_char_index = None + + if replacement_char_index is None or ( + replacement_char_index < len(decoded_full) + and decoded_full[replacement_char_index] == replacement_char + ): + words.append(decoded) + word_tokens.append(current_tokens) + current_tokens = [] + unicode_offset += len(decoded) + + return words, word_tokens + + def split_tokens_on_spaces( + self, tokens: List[int] + ) -> Tuple[List[str], List[List[int]]]: + subwords, subword_tokens_list = self.split_tokens_on_unicode(tokens) + words = [] + word_tokens = [] + + for subword, subword_tokens in zip(subwords, subword_tokens_list): + special = subword_tokens[0] >= self.eot + with_space = subword.startswith(" ") + punctuation = subword.strip() in string.punctuation + if special or with_space or punctuation or len(words) == 0: + words.append(subword) + word_tokens.append(subword_tokens) + else: + words[-1] = words[-1] + subword + word_tokens[-1].extend(subword_tokens) + + return words, word_tokens _TASKS = ( - "transcribe", - "translate", + "transcribe", + "translate", ) _LANGUAGE_CODES = ( - "af", - "am", - "ar", - "as", - "az", - "ba", - "be", - "bg", - "bn", - "bo", - "br", - "bs", - "ca", - "cs", - "cy", - "da", - "de", - "el", - "en", - "es", - "et", - "eu", - "fa", - "fi", - "fo", - "fr", - "gl", - "gu", - "ha", - "haw", - "he", - "hi", - "hr", - "ht", - "hu", - "hy", - "id", - "is", - "it", - "ja", - "jw", - "ka", - "kk", - "km", - "kn", - "ko", - "la", - "lb", - "ln", - "lo", - "lt", - "lv", - "mg", - "mi", - "mk", - "ml", - "mn", - "mr", - "ms", - "mt", - "my", - "ne", - "nl", - "nn", - "no", - "oc", - "pa", - "pl", - "ps", - "pt", - "ro", - "ru", - "sa", - "sd", - "si", - "sk", - "sl", - "sn", - "so", - "sq", - "sr", - "su", - "sv", - "sw", - "ta", - "te", - "tg", - "th", - "tk", - "tl", - "tr", - "tt", - "uk", - "ur", - "uz", - "vi", - "yi", - "yo", - "zh", - "yue", + "af", + "am", + "ar", + "as", + "az", + "ba", + "be", + "bg", + "bn", + "bo", + "br", + "bs", + "ca", + "cs", + "cy", + "da", + "de", + "el", + "en", + "es", + "et", + "eu", + "fa", + "fi", + "fo", + "fr", + "gl", + "gu", 
+ "ha", + "haw", + "he", + "hi", + "hr", + "ht", + "hu", + "hy", + "id", + "is", + "it", + "ja", + "jw", + "ka", + "kk", + "km", + "kn", + "ko", + "la", + "lb", + "ln", + "lo", + "lt", + "lv", + "mg", + "mi", + "mk", + "ml", + "mn", + "mr", + "ms", + "mt", + "my", + "ne", + "nl", + "nn", + "no", + "oc", + "pa", + "pl", + "ps", + "pt", + "ro", + "ru", + "sa", + "sd", + "si", + "sk", + "sl", + "sn", + "so", + "sq", + "sr", + "su", + "sv", + "sw", + "ta", + "te", + "tg", + "th", + "tk", + "tl", + "tr", + "tt", + "uk", + "ur", + "uz", + "vi", + "yi", + "yo", + "zh", + "yue", ) From 6aee94e761535fc6a10f39460b0cd2550b2aaa5f Mon Sep 17 00:00:00 2001 From: BBC-Esq Date: Sat, 26 Oct 2024 16:38:39 -0400 Subject: [PATCH 2/9] Update tokenizer.py --- faster_whisper/tokenizer.py | 630 ++++++++++++++++++------------------ 1 file changed, 315 insertions(+), 315 deletions(-) diff --git a/faster_whisper/tokenizer.py b/faster_whisper/tokenizer.py index 8bcf2c8d..3eae92a5 100644 --- a/faster_whisper/tokenizer.py +++ b/faster_whisper/tokenizer.py @@ -7,328 +7,328 @@ class TokenizationError(Exception): - pass + pass class Tokenizer: - """Simple wrapper around a tokenizers.Tokenizer.""" - - def __init__( - self, - tokenizer: tokenizers.Tokenizer, - multilingual: bool, - task: Optional[str] = None, - language: Optional[str] = None, - ): - self.tokenizer = tokenizer - - if multilingual: - if task not in _TASKS: - raise ValueError( - "'%s' is not a valid task (accepted tasks: %s)" - % (task, ", ".join(_TASKS)) - ) - - if language not in _LANGUAGE_CODES: - raise ValueError( - "'%s' is not a valid language code (accepted language codes: %s)" - % (language, ", ".join(_LANGUAGE_CODES)) - ) - - self.task = self.tokenizer.token_to_id("<|%s|>" % task) - self.language = self.tokenizer.token_to_id("<|%s|>" % language) - self.language_code = language - else: - self.task = None - self.language = None - self.language_code = "en" - - @cached_property - def transcribe(self) -> int: - return self.tokenizer.token_to_id("<|transcribe|>") - - @cached_property - def translate(self) -> int: - return self.tokenizer.token_to_id("<|translate|>") - - @cached_property - def sot(self) -> int: - return self.tokenizer.token_to_id("<|startoftranscript|>") - - @cached_property - def sot_lm(self) -> int: - return self.tokenizer.token_to_id("<|startoflm|>") - - @cached_property - def sot_prev(self) -> int: - return self.tokenizer.token_to_id("<|startofprev|>") - - @cached_property - def eot(self) -> int: - return self.tokenizer.token_to_id("<|endoftext|>") - - @cached_property - def no_timestamps(self) -> int: - return self.tokenizer.token_to_id("<|notimestamps|>") - - @property - def timestamp_begin(self) -> int: - return self.no_timestamps + 1 - - @property - def sot_sequence(self) -> List[int]: - sequence = [self.sot] - - if self.language is not None: - sequence.append(self.language) - - if self.task is not None: - sequence.append(self.task) - - return sequence - - def encode(self, text: str) -> List[int]: - return self.tokenizer.encode(text, add_special_tokens=False).ids - - def decode(self, tokens: List[int]) -> str: - try: - text_tokens = [token for token in tokens if token < self.eot] - if not text_tokens: - raise ValueError("No valid text tokens found") - if any(not isinstance(t, int) or t < 0 for t in text_tokens): - raise ValueError("Invalid token values detected") - return self.tokenizer.decode(text_tokens) - except Exception as e: - raise TokenizationError(f"Failed to decode tokens: {e}") from e - - def decode_with_timestamps(self, 
tokens: List[int]) -> str: - try: - if not tokens: - raise ValueError("Empty token sequence") - if any(not isinstance(t, int) or t < 0 for t in tokens): - raise ValueError("Invalid token values detected") - - outputs = [[]] - for token in tokens: - if token >= self.timestamp_begin: - timestamp = f"<|{(token - self.timestamp_begin) * 0.02:.2f}|>" - outputs.append(timestamp) - outputs.append([]) - else: - outputs[-1].append(token) - - decoded = [s if isinstance(s, str) else self.tokenizer.decode(s) for s in outputs] - if not any(decoded): - raise ValueError("Decoding produced no valid output") - - return "".join(decoded) - except Exception as e: - raise TokenizationError(f"Failed to decode tokens with timestamps: {e}") from e - - @cached_property - def non_speech_tokens(self) -> Tuple[int]: - """ - Returns the list of tokens to suppress in order to avoid any speaker tags or non-speech - annotations, to prevent sampling texts that are not actually spoken in the audio, e.g. - - - ♪♪♪ - - ( SPEAKING FOREIGN LANGUAGE ) - - [DAVID] Hey there, - - keeping basic punctuations like commas, periods, question marks, exclamation points, etc. - """ - symbols = list('"#()*+/:;<=>@[\\]^_`{|}~「」『』') - symbols += ( - "<< >> <<< >>> -- --- -( -[ (' (\" (( )) ((( ))) [[ ]] {{ }} ♪♪ ♪♪♪".split() - ) - - # symbols that may be a single token or multiple tokens depending on the tokenizer. - # In case they're multiple tokens, suppress the first token, which is safe because: - # These are between U+2640 and U+267F miscellaneous symbols that are okay to suppress - # in generations, and in the 3-byte UTF-8 representation they share the first two bytes. - miscellaneous = set("♩♪♫♬♭♮♯") - assert all(0x2640 <= ord(c) <= 0x267F for c in miscellaneous) - - # allow hyphens "-" and single quotes "'" between words, but not at the beginning of a word - result = {self.encode(" -")[0], self.encode(" '")[0]} - for symbol in symbols + list(miscellaneous): - for tokens in [ - self.encode(symbol), - self.encode(" " + symbol), - ]: - if len(tokens) == 1 or symbol in miscellaneous: - result.add(tokens[0]) - - return tuple(sorted(result)) - - def split_to_word_tokens( - self, tokens: List[int] - ) -> Tuple[List[str], List[List[int]]]: - if self.language_code in {"zh", "ja", "th", "lo", "my", "yue"}: - # These languages don't typically use spaces, so it is difficult to split words - # without morpheme analysis. 
Here, we instead split words at any - # position where the tokens are decoded as valid unicode points - return self.split_tokens_on_unicode(tokens) - - return self.split_tokens_on_spaces(tokens) - - def split_tokens_on_unicode( - self, tokens: List[int] - ) -> Tuple[List[str], List[List[int]]]: - decoded_full = self.decode_with_timestamps(tokens) - replacement_char = "\ufffd" - - words = [] - word_tokens = [] - current_tokens = [] - unicode_offset = 0 - - for token in tokens: - current_tokens.append(token) - decoded = self.decode_with_timestamps(current_tokens) - - try: - replacement_char_index = decoded.index(replacement_char) - replacement_char_index += unicode_offset - except ValueError: - replacement_char_index = None - - if replacement_char_index is None or ( - replacement_char_index < len(decoded_full) - and decoded_full[replacement_char_index] == replacement_char - ): - words.append(decoded) - word_tokens.append(current_tokens) - current_tokens = [] - unicode_offset += len(decoded) - - return words, word_tokens - - def split_tokens_on_spaces( - self, tokens: List[int] - ) -> Tuple[List[str], List[List[int]]]: - subwords, subword_tokens_list = self.split_tokens_on_unicode(tokens) - words = [] - word_tokens = [] - - for subword, subword_tokens in zip(subwords, subword_tokens_list): - special = subword_tokens[0] >= self.eot - with_space = subword.startswith(" ") - punctuation = subword.strip() in string.punctuation - if special or with_space or punctuation or len(words) == 0: - words.append(subword) - word_tokens.append(subword_tokens) - else: - words[-1] = words[-1] + subword - word_tokens[-1].extend(subword_tokens) - - return words, word_tokens + """Simple wrapper around a tokenizers.Tokenizer.""" + + def __init__( + self, + tokenizer: tokenizers.Tokenizer, + multilingual: bool, + task: Optional[str] = None, + language: Optional[str] = None, + ): + self.tokenizer = tokenizer + + if multilingual: + if task not in _TASKS: + raise ValueError( + "'%s' is not a valid task (accepted tasks: %s)" + % (task, ", ".join(_TASKS)) + ) + + if language not in _LANGUAGE_CODES: + raise ValueError( + "'%s' is not a valid language code (accepted language codes: %s)" + % (language, ", ".join(_LANGUAGE_CODES)) + ) + + self.task = self.tokenizer.token_to_id("<|%s|>" % task) + self.language = self.tokenizer.token_to_id("<|%s|>" % language) + self.language_code = language + else: + self.task = None + self.language = None + self.language_code = "en" + + @cached_property + def transcribe(self) -> int: + return self.tokenizer.token_to_id("<|transcribe|>") + + @cached_property + def translate(self) -> int: + return self.tokenizer.token_to_id("<|translate|>") + + @cached_property + def sot(self) -> int: + return self.tokenizer.token_to_id("<|startoftranscript|>") + + @cached_property + def sot_lm(self) -> int: + return self.tokenizer.token_to_id("<|startoflm|>") + + @cached_property + def sot_prev(self) -> int: + return self.tokenizer.token_to_id("<|startofprev|>") + + @cached_property + def eot(self) -> int: + return self.tokenizer.token_to_id("<|endoftext|>") + + @cached_property + def no_timestamps(self) -> int: + return self.tokenizer.token_to_id("<|notimestamps|>") + + @property + def timestamp_begin(self) -> int: + return self.no_timestamps + 1 + + @property + def sot_sequence(self) -> List[int]: + sequence = [self.sot] + + if self.language is not None: + sequence.append(self.language) + + if self.task is not None: + sequence.append(self.task) + + return sequence + + def encode(self, text: str) -> 
List[int]: + return self.tokenizer.encode(text, add_special_tokens=False).ids + + def decode(self, tokens: List[int]) -> str: + try: + text_tokens = [token for token in tokens if token < self.eot] + if not text_tokens: + raise ValueError("No valid text tokens found") + if any(not isinstance(t, int) or t < 0 for t in text_tokens): + raise ValueError("Invalid token values detected") + return self.tokenizer.decode(text_tokens) + except Exception as e: + raise TokenizationError(f"Failed to decode tokens: {e}") from e + + def decode_with_timestamps(self, tokens: List[int]) -> str: + try: + if not tokens: + raise ValueError("Empty token sequence") + if any(not isinstance(t, int) or t < 0 for t in tokens): + raise ValueError("Invalid token values detected") + + outputs = [[]] + for token in tokens: + if token >= self.timestamp_begin: + timestamp = f"<|{(token - self.timestamp_begin) * 0.02:.2f}|>" + outputs.append(timestamp) + outputs.append([]) + else: + outputs[-1].append(token) + + decoded = [s if isinstance(s, str) else self.tokenizer.decode(s) for s in outputs] + if not any(decoded): + raise ValueError("Decoding produced no valid output") + + return "".join(decoded) + except Exception as e: + raise TokenizationError(f"Failed to decode tokens with timestamps: {e}") from e + + @cached_property + def non_speech_tokens(self) -> Tuple[int]: + """ + Returns the list of tokens to suppress in order to avoid any speaker tags or non-speech + annotations, to prevent sampling texts that are not actually spoken in the audio, e.g. + + - ♪♪♪ + - ( SPEAKING FOREIGN LANGUAGE ) + - [DAVID] Hey there, + + keeping basic punctuations like commas, periods, question marks, exclamation points, etc. + """ + symbols = list('"#()*+/:;<=>@[\\]^_`{|}~「」『』') + symbols += ( + "<< >> <<< >>> -- --- -( -[ (' (\" (( )) ((( ))) [[ ]] {{ }} ♪♪ ♪♪♪".split() + ) + + # symbols that may be a single token or multiple tokens depending on the tokenizer. + # In case they're multiple tokens, suppress the first token, which is safe because: + # These are between U+2640 and U+267F miscellaneous symbols that are okay to suppress + # in generations, and in the 3-byte UTF-8 representation they share the first two bytes. + miscellaneous = set("♩♪♫♬♭♮♯") + assert all(0x2640 <= ord(c) <= 0x267F for c in miscellaneous) + + # allow hyphens "-" and single quotes "'" between words, but not at the beginning of a word + result = {self.encode(" -")[0], self.encode(" '")[0]} + for symbol in symbols + list(miscellaneous): + for tokens in [ + self.encode(symbol), + self.encode(" " + symbol), + ]: + if len(tokens) == 1 or symbol in miscellaneous: + result.add(tokens[0]) + + return tuple(sorted(result)) + + def split_to_word_tokens( + self, tokens: List[int] + ) -> Tuple[List[str], List[List[int]]]: + if self.language_code in {"zh", "ja", "th", "lo", "my", "yue"}: + # These languages don't typically use spaces, so it is difficult to split words + # without morpheme analysis. 
Here, we instead split words at any + # position where the tokens are decoded as valid unicode points + return self.split_tokens_on_unicode(tokens) + + return self.split_tokens_on_spaces(tokens) + + def split_tokens_on_unicode( + self, tokens: List[int] + ) -> Tuple[List[str], List[List[int]]]: + decoded_full = self.decode_with_timestamps(tokens) + replacement_char = "\ufffd" + + words = [] + word_tokens = [] + current_tokens = [] + unicode_offset = 0 + + for token in tokens: + current_tokens.append(token) + decoded = self.decode_with_timestamps(current_tokens) + + try: + replacement_char_index = decoded.index(replacement_char) + replacement_char_index += unicode_offset + except ValueError: + replacement_char_index = None + + if replacement_char_index is None or ( + replacement_char_index < len(decoded_full) + and decoded_full[replacement_char_index] == replacement_char + ): + words.append(decoded) + word_tokens.append(current_tokens) + current_tokens = [] + unicode_offset += len(decoded) + + return words, word_tokens + + def split_tokens_on_spaces( + self, tokens: List[int] + ) -> Tuple[List[str], List[List[int]]]: + subwords, subword_tokens_list = self.split_tokens_on_unicode(tokens) + words = [] + word_tokens = [] + + for subword, subword_tokens in zip(subwords, subword_tokens_list): + special = subword_tokens[0] >= self.eot + with_space = subword.startswith(" ") + punctuation = subword.strip() in string.punctuation + if special or with_space or punctuation or len(words) == 0: + words.append(subword) + word_tokens.append(subword_tokens) + else: + words[-1] = words[-1] + subword + word_tokens[-1].extend(subword_tokens) + + return words, word_tokens _TASKS = ( - "transcribe", - "translate", + "transcribe", + "translate", ) _LANGUAGE_CODES = ( - "af", - "am", - "ar", - "as", - "az", - "ba", - "be", - "bg", - "bn", - "bo", - "br", - "bs", - "ca", - "cs", - "cy", - "da", - "de", - "el", - "en", - "es", - "et", - "eu", - "fa", - "fi", - "fo", - "fr", - "gl", - "gu", - "ha", - "haw", - "he", - "hi", - "hr", - "ht", - "hu", - "hy", - "id", - "is", - "it", - "ja", - "jw", - "ka", - "kk", - "km", - "kn", - "ko", - "la", - "lb", - "ln", - "lo", - "lt", - "lv", - "mg", - "mi", - "mk", - "ml", - "mn", - "mr", - "ms", - "mt", - "my", - "ne", - "nl", - "nn", - "no", - "oc", - "pa", - "pl", - "ps", - "pt", - "ro", - "ru", - "sa", - "sd", - "si", - "sk", - "sl", - "sn", - "so", - "sq", - "sr", - "su", - "sv", - "sw", - "ta", - "te", - "tg", - "th", - "tk", - "tl", - "tr", - "tt", - "uk", - "ur", - "uz", - "vi", - "yi", - "yo", - "zh", - "yue", + "af", + "am", + "ar", + "as", + "az", + "ba", + "be", + "bg", + "bn", + "bo", + "br", + "bs", + "ca", + "cs", + "cy", + "da", + "de", + "el", + "en", + "es", + "et", + "eu", + "fa", + "fi", + "fo", + "fr", + "gl", + "gu", + "ha", + "haw", + "he", + "hi", + "hr", + "ht", + "hu", + "hy", + "id", + "is", + "it", + "ja", + "jw", + "ka", + "kk", + "km", + "kn", + "ko", + "la", + "lb", + "ln", + "lo", + "lt", + "lv", + "mg", + "mi", + "mk", + "ml", + "mn", + "mr", + "ms", + "mt", + "my", + "ne", + "nl", + "nn", + "no", + "oc", + "pa", + "pl", + "ps", + "pt", + "ro", + "ru", + "sa", + "sd", + "si", + "sk", + "sl", + "sn", + "so", + "sq", + "sr", + "su", + "sv", + "sw", + "ta", + "te", + "tg", + "th", + "tk", + "tl", + "tr", + "tt", + "uk", + "ur", + "uz", + "vi", + "yi", + "yo", + "zh", + "yue", ) From 6ae3dee491d7662c1803776a5080dda812102aea Mon Sep 17 00:00:00 2001 From: BBC-Esq Date: Sat, 26 Oct 2024 16:42:56 -0400 Subject: [PATCH 3/9] Update tokenizer.py --- 
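Aside (illustration, not part of the patch series): patches 1 and 2 route decode() and decode_with_timestamps() failures through the new TokenizationError. A minimal usage sketch of that contract follows, assuming a Whisper-style tokenizer.json that actually defines the <|startoftranscript|>, task and language special tokens; the file path and the bad token ID below are placeholders, not values taken from the patches.

import tokenizers

from faster_whisper.tokenizer import TokenizationError, Tokenizer

# Hypothetical tokenizer file; any tokenizers.Tokenizer that carries the Whisper
# special tokens (<|transcribe|>, <|en|>, <|endoftext|>, ...) would work the same way.
hf_tokenizer = tokenizers.Tokenizer.from_file("tokenizer.json")
tok = Tokenizer(hf_tokenizer, multilingual=True, task="transcribe", language="en")

ids = tok.encode("hello world")
print(tok.decode(ids))  # ordinary text still round-trips as before

try:
    # With these patches, malformed input (here a negative ID) surfaces as
    # TokenizationError instead of propagating a low-level exception.
    tok.decode([-1])
except TokenizationError as exc:
    print(f"decode failed: {exc}")

Wrapping the failure in a dedicated exception type presumably lets the rest of faster-whisper catch tokenizer problems separately from unrelated errors.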
faster_whisper/tokenizer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/faster_whisper/tokenizer.py b/faster_whisper/tokenizer.py index 3eae92a5..8583d02c 100644 --- a/faster_whisper/tokenizer.py +++ b/faster_whisper/tokenizer.py @@ -107,7 +107,7 @@ def decode_with_timestamps(self, tokens: List[int]) -> str: raise ValueError("Empty token sequence") if any(not isinstance(t, int) or t < 0 for t in tokens): raise ValueError("Invalid token values detected") - + outputs = [[]] for token in tokens: if token >= self.timestamp_begin: @@ -120,7 +120,7 @@ def decode_with_timestamps(self, tokens: List[int]) -> str: decoded = [s if isinstance(s, str) else self.tokenizer.decode(s) for s in outputs] if not any(decoded): raise ValueError("Decoding produced no valid output") - + return "".join(decoded) except Exception as e: raise TokenizationError(f"Failed to decode tokens with timestamps: {e}") from e From b259749790bccbb6ea7b98739de683c6e7ee0e68 Mon Sep 17 00:00:00 2001 From: BBC-Esq Date: Sat, 26 Oct 2024 16:48:47 -0400 Subject: [PATCH 4/9] Update tokenizer.py --- faster_whisper/tokenizer.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/faster_whisper/tokenizer.py b/faster_whisper/tokenizer.py index 8583d02c..925f3086 100644 --- a/faster_whisper/tokenizer.py +++ b/faster_whisper/tokenizer.py @@ -1,5 +1,4 @@ import string - from functools import cached_property from typing import List, Optional, Tuple @@ -117,13 +116,17 @@ def decode_with_timestamps(self, tokens: List[int]) -> str: else: outputs[-1].append(token) - decoded = [s if isinstance(s, str) else self.tokenizer.decode(s) for s in outputs] + decoded = [ + s if isinstance(s, str) else self.tokenizer.decode(s) for s in outputs + ] if not any(decoded): raise ValueError("Decoding produced no valid output") return "".join(decoded) except Exception as e: - raise TokenizationError(f"Failed to decode tokens with timestamps: {e}") from e + raise TokenizationError( + f"Failed to decode tokens with timestamps: {e}" + ) from e @cached_property def non_speech_tokens(self) -> Tuple[int]: @@ -225,10 +228,7 @@ def split_tokens_on_spaces( return words, word_tokens -_TASKS = ( - "transcribe", - "translate", -) +_TASKS = ("transcribe", "translate") _LANGUAGE_CODES = ( "af", From f6ec789d1c9bfc373b19c1341d820ac4ea0acc18 Mon Sep 17 00:00:00 2001 From: BBC-Esq Date: Sat, 26 Oct 2024 16:55:10 -0400 Subject: [PATCH 5/9] Update tokenizer.py --- faster_whisper/tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faster_whisper/tokenizer.py b/faster_whisper/tokenizer.py index 925f3086..659ffb25 100644 --- a/faster_whisper/tokenizer.py +++ b/faster_whisper/tokenizer.py @@ -1,6 +1,6 @@ -import string from functools import cached_property from typing import List, Optional, Tuple +import string import tokenizers From dce7d132b841f269103a1329ffb775f6bc469568 Mon Sep 17 00:00:00 2001 From: BBC-Esq Date: Sat, 26 Oct 2024 17:02:10 -0400 Subject: [PATCH 6/9] Update tokenizer.py --- faster_whisper/tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faster_whisper/tokenizer.py b/faster_whisper/tokenizer.py index 659ffb25..925f3086 100644 --- a/faster_whisper/tokenizer.py +++ b/faster_whisper/tokenizer.py @@ -1,6 +1,6 @@ +import string from functools import cached_property from typing import List, Optional, Tuple -import string import tokenizers From 3753fded004a6d8489b5ae3bf6f92c96f0c3c8d9 Mon Sep 17 00:00:00 2001 From: BBC-Esq Date: Sat, 26 Oct 2024 17:05:27 -0400 
Subject: [PATCH 7/9] Update tokenizer.py --- faster_whisper/tokenizer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/faster_whisper/tokenizer.py b/faster_whisper/tokenizer.py index 925f3086..6e6c6434 100644 --- a/faster_whisper/tokenizer.py +++ b/faster_whisper/tokenizer.py @@ -1,4 +1,5 @@ import string + from functools import cached_property from typing import List, Optional, Tuple From 9e5944f8d72e8d74c4fb99d4fceb174ef9fcec28 Mon Sep 17 00:00:00 2001 From: BBC-Esq Date: Sat, 26 Oct 2024 17:16:04 -0400 Subject: [PATCH 8/9] Update tokenizer.py --- faster_whisper/tokenizer.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/faster_whisper/tokenizer.py b/faster_whisper/tokenizer.py index 6e6c6434..d0c89421 100644 --- a/faster_whisper/tokenizer.py +++ b/faster_whisper/tokenizer.py @@ -94,20 +94,20 @@ def decode(self, tokens: List[int]) -> str: try: text_tokens = [token for token in tokens if token < self.eot] if not text_tokens: - raise ValueError("No valid text tokens found") + return "" if any(not isinstance(t, int) or t < 0 for t in text_tokens): raise ValueError("Invalid token values detected") return self.tokenizer.decode(text_tokens) except Exception as e: raise TokenizationError(f"Failed to decode tokens: {e}") from e - + def decode_with_timestamps(self, tokens: List[int]) -> str: try: if not tokens: raise ValueError("Empty token sequence") if any(not isinstance(t, int) or t < 0 for t in tokens): raise ValueError("Invalid token values detected") - + outputs = [[]] for token in tokens: if token >= self.timestamp_begin: @@ -116,13 +116,14 @@ def decode_with_timestamps(self, tokens: List[int]) -> str: outputs.append([]) else: outputs[-1].append(token) - + decoded = [ s if isinstance(s, str) else self.tokenizer.decode(s) for s in outputs ] - if not any(decoded): - raise ValueError("Decoding produced no valid output") - + + if not any(decoded) and not any(isinstance(s, str) for s in outputs): + return "" + return "".join(decoded) except Exception as e: raise TokenizationError( From 53bd629af1e30d9f12c13c77606a4ccfd6a319a0 Mon Sep 17 00:00:00 2001 From: BBC-Esq Date: Sat, 26 Oct 2024 17:41:08 -0400 Subject: [PATCH 9/9] Update tokenizer.py --- faster_whisper/tokenizer.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/faster_whisper/tokenizer.py b/faster_whisper/tokenizer.py index d0c89421..cc208829 100644 --- a/faster_whisper/tokenizer.py +++ b/faster_whisper/tokenizer.py @@ -100,14 +100,14 @@ def decode(self, tokens: List[int]) -> str: return self.tokenizer.decode(text_tokens) except Exception as e: raise TokenizationError(f"Failed to decode tokens: {e}") from e - + def decode_with_timestamps(self, tokens: List[int]) -> str: try: if not tokens: raise ValueError("Empty token sequence") if any(not isinstance(t, int) or t < 0 for t in tokens): raise ValueError("Invalid token values detected") - + outputs = [[]] for token in tokens: if token >= self.timestamp_begin: @@ -116,14 +116,14 @@ def decode_with_timestamps(self, tokens: List[int]) -> str: outputs.append([]) else: outputs[-1].append(token) - + decoded = [ s if isinstance(s, str) else self.tokenizer.decode(s) for s in outputs ] - + if not any(decoded) and not any(isinstance(s, str) for s in outputs): return "" - + return "".join(decoded) except Exception as e: raise TokenizationError(
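Aside (illustration, not part of the patch series): the later patches adjust failure modes rather than the tokenization itself. PATCH 8 makes decode() return an empty string when filtering leaves no text tokens, instead of raising, while decode_with_timestamps() keeps Whisper's 0.02 s step per timestamp token. A small sketch of that arithmetic, assuming an already-constructed Tokenizer instance named tok (not defined in the patches):

from faster_whisper.tokenizer import Tokenizer

def timestamp_token_to_seconds(tok: Tokenizer, token: int) -> float:
    # Inverse view of the f"<|{(token - timestamp_begin) * 0.02:.2f}|>" formatting
    # in decode_with_timestamps(): timestamp tokens are spaced 20 ms apart.
    if token < tok.timestamp_begin:
        raise ValueError("not a timestamp token")
    return (token - tok.timestamp_begin) * 0.02

# After PATCH 8, an input that filters down to no text tokens decodes to "" rather
# than raising, e.g.:
# tok.decode([tok.eot]) == ""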