Fix cache max_seq_len (#568)
* fix max_seq_len

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* another one

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Fix max new tokens

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
AnyaCoder and pre-commit-ci[bot] authored Sep 19, 2024
1 parent 711209e commit ad55185
Showing 4 changed files with 22 additions and 7 deletions.
1 change: 1 addition & 0 deletions install_env.bat
@@ -144,6 +144,7 @@ call :download_and_install "triton_windows-0.1.0-py3-none-any.whl" ^

endlocal
echo "Environment Check: Success."
+:end
pause

goto :EOF
4 changes: 2 additions & 2 deletions tools/api.py
@@ -220,7 +220,7 @@ def inference(req: ServeTTSRequest):
compile=args.compile,
iterative_prompt=req.chunk_length > 0,
chunk_length=req.chunk_length,
-max_length=2048,
+max_length=4096,
prompt_tokens=prompt_tokens,
prompt_text=prompt_texts,
)
@@ -424,7 +424,7 @@ async def data(self) -> Annotated[Any, ContentType("application/msgpack")]:
text="Hello world.",
references=[],
reference_id=None,
-max_new_tokens=1024,
+max_new_tokens=0,
chunk_length=200,
top_p=0.7,
repetition_penalty=1.2,
20 changes: 17 additions & 3 deletions tools/llama/generate.py
@@ -237,6 +237,16 @@ def generate(
# create an empty tensor of the expected final shape and fill in the current tokens
T = prompt.size(1)

+if max_new_tokens:
+    if T + max_new_tokens > model.config.max_seq_len:
+        max_new_tokens = model.config.max_seq_len - T
+        logger.info(f"Truncating max_new_tokens to {max_new_tokens}")
+
+    T_new = T + max_new_tokens
+else:
+    T_new = model.config.max_seq_len
+    max_new_tokens = T_new - T
+
device, dtype = prompt.device, prompt.dtype

codebook_dim = 1 + model.config.num_codebooks
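
For reference, here is a small self-contained sketch of the budget logic added above; the function name and `prompt_len` are illustrative, not from the repository. An explicit `max_new_tokens` is clamped so the prompt plus new tokens fit within `model.config.max_seq_len`, while 0 falls back to the full remaining window.

```python
def token_budget(prompt_len: int, max_new_tokens: int, max_seq_len: int) -> int:
    """How many new tokens may be generated (standalone sketch of the logic above)."""
    if max_new_tokens:
        # An explicit request is clamped so prompt + new tokens fit the context window.
        return min(max_new_tokens, max_seq_len - prompt_len)
    # 0 means "no per-request cap": use whatever room is left in the window.
    return max_seq_len - prompt_len


print(token_budget(1500, 3000, 4096))  # 2596 -- clamped to the remaining window
print(token_budget(1500, 0, 4096))     # 2596 -- the new default of 0 fills the window
print(token_budget(1500, 1024, 4096))  # 1024 -- the old fixed default
```

This is also why the defaults elsewhere in this commit move from 1024 to 0: 0 now means "generate up to the cache limit" rather than a fixed cap.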
@@ -565,7 +575,9 @@ def worker():
)
with torch.device(device):
model.setup_caches(
-    max_batch_size=1, max_seq_len=2048, dtype=next(model.parameters()).dtype
+    max_batch_size=1,
+    max_seq_len=model.config.max_seq_len,
+    dtype=next(model.parameters()).dtype,
)
init_event.set()

@@ -607,7 +619,7 @@ def worker():
multiple=True,
)
@click.option("--num-samples", type=int, default=1)
-@click.option("--max-new-tokens", type=int, default=1024)
+@click.option("--max-new-tokens", type=int, default=0)
@click.option("--top-p", type=float, default=0.7)
@click.option("--repetition-penalty", type=float, default=1.2)
@click.option("--temperature", type=float, default=0.7)
@@ -654,7 +666,9 @@ def main(
)
with torch.device(device):
model.setup_caches(
-    max_batch_size=1, max_seq_len=2048, dtype=next(model.parameters()).dtype
+    max_batch_size=1,
+    max_seq_len=model.config.max_seq_len,
+    dtype=next(model.parameters()).dtype,
)
if torch.cuda.is_available():
torch.cuda.synchronize()
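
For intuition on the setup_caches change, here is a toy sketch of why the cache is now sized from model.config.max_seq_len instead of a hard-coded 2048; the class and tensor shapes are assumptions for illustration, not the repository's KVCache. A pre-allocated KV cache is a fixed-size buffer, so generation can never run past its length.

```python
import torch

class TinyKVCache:
    """Toy fixed-size KV cache: its length is a hard ceiling on prompt + generated tokens."""

    def __init__(self, max_seq_len: int, n_heads: int = 8, head_dim: int = 64):
        self.k = torch.zeros(1, n_heads, max_seq_len, head_dim)
        self.v = torch.zeros(1, n_heads, max_seq_len, head_dim)

    def update(self, pos: int, k: torch.Tensor, v: torch.Tensor) -> None:
        # Writing at pos >= max_seq_len fails, so the cache must be allocated for
        # the longest sequence the model is configured to handle.
        self.k[:, :, pos] = k
        self.v[:, :, pos] = v


cache = TinyKVCache(max_seq_len=4096)
cache.update(3000, torch.randn(1, 8, 64), torch.randn(1, 8, 64))  # fits in a 4096-entry cache
# With a hard-coded 2048-entry cache the same position would be out of range.
```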
4 changes: 2 additions & 2 deletions tools/webui.py
@@ -286,7 +286,7 @@ def build_app():
label=i18n("Maximum tokens per batch, 0 means no limit"),
minimum=0,
maximum=2048,
-value=1024, # 0 means no limit
+value=0, # 0 means no limit
step=8,
)

@@ -505,7 +505,7 @@ def parse_args():
enable_reference_audio=False,
reference_audio=None,
reference_text="",
-max_new_tokens=1024,
+max_new_tokens=0,
chunk_length=200,
top_p=0.7,
repetition_penalty=1.2,
