GoogleCloudPlatform · copybara-service · Oct 18, 2024 · Oct 18, 2024
@@ -482,6 +482,8 @@
  "_region = REGION\n",
  "REGION = TPU_DEPLOYMENT_REGION\n",
  "\n",
+ "if use_dedicated_endpoint:\n",
+ " DEDICATED_ENDPOINT_DNS = endpoints[\"hexllm_tpu\"].gca_resource.dedicated_endpoint_dns\n",
  "ENDPOINT_RESOURCE_NAME = \"projects/{}/locations/{}/endpoints/{}\".format(\n",
  " PROJECT_ID, REGION, endpoints[\"hexllm_tpu\"].name\n",
  ")\n",
@@ -513,6 +515,12 @@
  "BASE_URL = (\n",
  " f\"https://{REGION}-aiplatform.googleapis.com/v1beta1/{ENDPOINT_RESOURCE_NAME}\"\n",
  ")\n",
+ "try:\n",
+ " if use_dedicated_endpoint:\n",
+ " BASE_URL = f\"https://{DEDICATED_ENDPOINT_DNS}/v1beta1/{ENDPOINT_RESOURCE_NAME}\"\n",
+ "except NameError:\n",
+ " pass\n",
+ "\n",
  "client = openai.OpenAI(base_url=BASE_URL, api_key=creds.token)\n",
  "\n",
  "model_response = client.chat.completions.create(\n",
@@ -624,6 +632,9 @@
  "# Note that a larger max_model_len will require more GPU memory.\n",
  "max_model_len = 2048\n",
  "\n",
+ "# @markdown Set use_dedicated_endpoint to False if you don't want to use a [dedicated endpoint](https://cloud.google.com/vertex-ai/docs/general/deployment#create-dedicated-endpoint).\n",
+ "use_dedicated_endpoint = True # @param {type:\"boolean\"}\n",
+ "\n",
  "\n",
  "def deploy_model_vllm(\n",
  " model_name: str,\n",
@@ -823,6 +834,8 @@
  "\n",
  "# @markdown You can build chat applications with the instruction-tuned Gemma models.\n",
  "\n",
+ "if use_dedicated_endpoint:\n",
+ " DEDICATED_ENDPOINT_DNS = endpoints[\"vllm_gpu\"].gca_resource.dedicated_endpoint_dns\n",
  "ENDPOINT_RESOURCE_NAME = \"projects/{}/locations/{}/endpoints/{}\".format(\n",
  " PROJECT_ID, REGION, endpoints[\"vllm_gpu\"].name\n",
  ")\n",
@@ -854,6 +867,12 @@
  "BASE_URL = (\n",
  " f\"https://{REGION}-aiplatform.googleapis.com/v1beta1/{ENDPOINT_RESOURCE_NAME}\"\n",
  ")\n",
+ "try:\n",
+ " if use_dedicated_endpoint:\n",
+ " BASE_URL = f\"https://{DEDICATED_ENDPOINT_DNS}/v1beta1/{ENDPOINT_RESOURCE_NAME}\"\n",
+ "except NameError:\n",
+ " pass\n",
+ "\n",
  "client = openai.OpenAI(base_url=BASE_URL, api_key=creds.token)\n",
  "\n",
  "model_response = client.chat.completions.create(\n",

@@ -535,6 +535,12 @@
  "BASE_URL = (\n",
  " f\"https://{REGION}-aiplatform.googleapis.com/v1beta1/{ENDPOINT_RESOURCE_NAME}\"\n",
  ")\n",
+ "try:\n",
+ " if use_dedicated_endpoint:\n",
+ " BASE_URL = f\"https://{DEDICATED_ENDPOINT_DNS}/v1beta1/{ENDPOINT_RESOURCE_NAME}\"\n",
+ "except NameError:\n",
+ " pass\n",
+ "\n",
  "client = openai.OpenAI(base_url=BASE_URL, api_key=creds.token)\n",
  "\n",
  "model_response = client.chat.completions.create(\n",
@@ -834,6 +840,12 @@
  "BASE_URL = (\n",
  " f\"https://{REGION}-aiplatform.googleapis.com/v1beta1/{ENDPOINT_RESOURCE_NAME}\"\n",
  ")\n",
+ "try:\n",
+ " if use_dedicated_endpoint:\n",
+ " BASE_URL = f\"https://{DEDICATED_ENDPOINT_DNS}/v1beta1/{ENDPOINT_RESOURCE_NAME}\"\n",
+ "except NameError:\n",
+ " pass\n",
+ "\n",
  "client = openai.OpenAI(base_url=BASE_URL, api_key=creds.token)\n",
  "\n",
  "model_response = client.chat.completions.create(\n",

@@ -523,6 +523,12 @@
  "BASE_URL = (\n",
  " f\"https://{REGION}-aiplatform.googleapis.com/v1beta1/{ENDPOINT_RESOURCE_NAME}\"\n",
  ")\n",
+ "try:\n",
+ " if use_dedicated_endpoint:\n",
+ " BASE_URL = f\"https://{DEDICATED_ENDPOINT_DNS}/v1beta1/{ENDPOINT_RESOURCE_NAME}\"\n",
+ "except NameError:\n",
+ " pass\n",
+ "\n",
  "client = openai.OpenAI(base_url=BASE_URL, api_key=creds.token)\n",
  "\n",
  "model_response = client.chat.completions.create(\n",
@@ -759,6 +765,12 @@
  "BASE_URL = (\n",
  " f\"https://{REGION}-aiplatform.googleapis.com/v1beta1/{ENDPOINT_RESOURCE_NAME}\"\n",
  ")\n",
+ "try:\n",
+ " if use_dedicated_endpoint:\n",
+ " BASE_URL = f\"https://{DEDICATED_ENDPOINT_DNS}/v1beta1/{ENDPOINT_RESOURCE_NAME}\"\n",
+ "except NameError:\n",
+ " pass\n",
+ "\n",
  "client = openai.OpenAI(base_url=BASE_URL, api_key=creds.token)\n",
  "\n",
  "model_response = client.chat.completions.create(\n",
@@ -815,6 +827,12 @@
  "BASE_URL = (\n",
  " f\"https://{REGION}-aiplatform.googleapis.com/v1beta1/{ENDPOINT_RESOURCE_NAME}\"\n",
  ")\n",
+ "try:\n",
+ " if use_dedicated_endpoint:\n",
+ " BASE_URL = f\"https://{DEDICATED_ENDPOINT_DNS}/v1beta1/{ENDPOINT_RESOURCE_NAME}\"\n",
+ "except NameError:\n",
+ " pass\n",
+ "\n",
  "client = openai.OpenAI(base_url=BASE_URL, api_key=creds.token)\n",
  "\n",
  "model_response = client.chat.completions.create(\n",

@@ -530,6 +530,12 @@
  "BASE_URL = (\n",
  " f\"https://{REGION}-aiplatform.googleapis.com/v1beta1/{ENDPOINT_RESOURCE_NAME}\"\n",
  ")\n",
+ "try:\n",
+ " if use_dedicated_endpoint:\n",
+ " BASE_URL = f\"https://{DEDICATED_ENDPOINT_DNS}/v1beta1/{ENDPOINT_RESOURCE_NAME}\"\n",
+ "except NameError:\n",
+ " pass\n",
+ "\n",
  "client = openai.OpenAI(base_url=BASE_URL, api_key=creds.token)\n",
  "\n",
  "model_response = client.chat.completions.create(\n",