diff --git a/README.md b/README.md
index 54d74450..44685e6f 100644
--- a/README.md
+++ b/README.md
@@ -165,12 +165,12 @@ Models can also be converted from the code. See the [conversion API](https://ope
 
 1. Directly load the model from a local directory:
 ```python
-model = faster_whisper.WhisperModel('whisper-large-v2-ct2')
+model = faster_whisper.WhisperModel("whisper-large-v2-ct2")
 ```
 
 2. [Upload your model to the Hugging Face Hub](https://huggingface.co/docs/transformers/model_sharing#upload-with-the-web-interface) and load it from its name:
 ```python
-model = faster_whisper.WhisperModel('username/whisper-large-v2-ct2')
+model = faster_whisper.WhisperModel("username/whisper-large-v2-ct2")
 ```
 
 ## Comparing performance against other implementations
diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py
index ea0f4c40..9d96d566 100644
--- a/faster_whisper/transcribe.py
+++ b/faster_whisper/transcribe.py
@@ -699,6 +699,8 @@ def get_prompt(
             prefix_tokens = tokenizer.encode(" " + prefix.strip())
             if len(prefix_tokens) >= self.max_length // 2:
                 prefix_tokens = prefix_tokens[: self.max_length // 2 - 1]
+            if not without_timestamps:
+                prompt.append(tokenizer.timestamp_begin)
             prompt.extend(prefix_tokens)
 
         return prompt
@@ -733,8 +735,6 @@ def add_word_timestamps(
         # hack: truncate long words at sentence boundaries.
         # a better segmentation algorithm based on VAD should be able to replace this.
         if len(word_durations) > 0:
-            median_duration = np.median(word_durations)
-            max_duration = median_duration * 2
             sentence_end_marks = ".。!!??"
             # ensure words at sentence boundaries
             # are not longer than twice the median word duration.
diff --git a/faster_whisper/version.py b/faster_whisper/version.py
index bf288f06..26a803c0 100644
--- a/faster_whisper/version.py
+++ b/faster_whisper/version.py
@@ -1,3 +1,3 @@
 """Version information."""
 
-__version__ = "0.6.0"
+__version__ = "0.7.0"
diff --git a/requirements.txt b/requirements.txt
index 4dd8bacf..819d3d22 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 av==10.*
-ctranslate2>=3.10,<4
+ctranslate2>=3.17,<4
 huggingface_hub>=0.13
 tokenizers==0.13.*
 onnxruntime>=1.14,<2
diff --git a/tests/test_transcribe.py b/tests/test_transcribe.py
index 6ecf2c49..ca8d5a9d 100644
--- a/tests/test_transcribe.py
+++ b/tests/test_transcribe.py
@@ -34,6 +34,24 @@ def test_transcribe(jfk_path):
         assert segment.end == segment.words[-1].end
 
 
+def test_prefix_with_timestamps(jfk_path):
+    model = WhisperModel("tiny")
+    segments, _ = model.transcribe(jfk_path, prefix="And so my fellow Americans")
+    segments = list(segments)
+
+    assert len(segments) == 1
+
+    segment = segments[0]
+
+    assert segment.text == (
+        " And so my fellow Americans ask not what your country can do for you, "
+        "ask what you can do for your country."
+    )
+
+    assert segment.start == 0
+    assert 10 < segment.end < 11
+
+
 def test_vad(jfk_path):
     model = WhisperModel("tiny")
     segments, info = model.transcribe(
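
For reference, a minimal usage sketch of the `prefix` option that the new `test_prefix_with_timestamps` test exercises; this is an illustration, not part of the change itself. `"audio.wav"` is a placeholder path, and `"tiny"` simply matches the model size used in the tests.

```python
# Minimal sketch of transcribing with a prefix, assuming a local audio file.
from faster_whisper import WhisperModel

model = WhisperModel("tiny")

# With the get_prompt() change above, a prefix no longer drops timestamps:
# tokenizer.timestamp_begin is appended before the prefix tokens whenever
# without_timestamps is False.
segments, info = model.transcribe("audio.wav", prefix="And so my fellow Americans")

for segment in segments:
    print(f"[{segment.start:.2f} -> {segment.end:.2f}]{segment.text}")
```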