"""Generated message classes for gkerecommender version v1.

GKE Recommender API
"""
# NOTE: This file is autogenerated and should not be edited by hand.

from __future__ import absolute_import

from apitools.base.protorpclite import messages as _messages
from apitools.base.py import encoding


package = 'gkerecommender'


class Amount(_messages.Message):
  r"""Represents an amount of money in a specific currency.

  Fields:
    nanos: Output only. Number of nano (10^-9) units of the amount. The value
      must be between -999,999,999 and +999,999,999 inclusive. If `units` is
      positive, `nanos` must be positive or zero. If `units` is zero, `nanos`
      can be positive, zero, or negative. If `units` is negative, `nanos` must
      be negative or zero. For example $-1.75 is represented as `units`=-1 and
      `nanos`=-750,000,000.
    units: Output only. The whole units of the amount. For example if
      `currencyCode` is `"USD"`, then 1 unit is one US dollar.
  """

  nanos = _messages.IntegerField(1, variant=_messages.Variant.INT32)
  units = _messages.IntegerField(2)
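

# Illustrative sketch, not generated code: the $-1.75 example from the
# docstring above, expressed as an Amount message.
def _example_amount():
  """Returns an Amount representing $-1.75 (units=-1, nanos=-750,000,000)."""
  return Amount(units=-1, nanos=-750000000)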


class Cost(_messages.Message):
  r"""Cost for running a model deployment on a given instance type. Currently,
  only USD currency code is supported.

  Fields:
    costPerMillionInputTokens: Optional. The cost per million input tokens.
      $/input token = ($/output token) / output-to-input-cost-ratio.
    costPerMillionOutputTokens: Optional. The cost per million output tokens,
      calculated as: $/output token = (GPU $/s) /
      ((1 / output-to-input-cost-ratio) * input tokens/s + output tokens/s).
    outputInputCostRatio: Optional. The output-to-input cost ratio. This
      determines how the total GPU cost is split between input and output
      tokens. If not provided, `4.0` is used, assuming a 4:1 output:input cost
      ratio.
    pricingModel: Optional. The pricing model used to calculate the cost. Can
      be one of: `3-years-cud`, `1-year-cud`, `on-demand`, `spot`. If not
      provided, `spot` will be used.
  """

  costPerMillionInputTokens = _messages.MessageField('Amount', 1)
  costPerMillionOutputTokens = _messages.MessageField('Amount', 2)
  outputInputCostRatio = _messages.FloatField(3, variant=_messages.Variant.FLOAT)
  pricingModel = _messages.StringField(4)
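

# Illustrative sketch, not generated code: a Cost built per the formulas in
# the docstring above. The dollar figures are hypothetical; only the 4.0
# default output-to-input ratio comes from the field documentation.
def _example_cost():
  """Returns a Cost with a hypothetical $2.00/million-output-token price.

  Per the docstring, $/input token = ($/output token) / ratio, so a 4.0
  ratio puts the input price at $0.50 per million tokens.
  """
  return Cost(
      costPerMillionOutputTokens=Amount(units=2, nanos=0),
      costPerMillionInputTokens=Amount(units=0, nanos=500000000),
      outputInputCostRatio=4.0,
      pricingModel='on-demand',
  )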


class FetchBenchmarkingDataRequest(_messages.Message):
  r"""Request message for GkeInferenceQuickstart.FetchBenchmarkingData.

  Fields:
    instanceType: Optional. The instance type to filter benchmarking data.
      Instance types are in the format `a2-highgpu-1g`. If not provided, all
      instance types for the given profile's `model_server_info` will be
      returned. Use GkeInferenceQuickstart.FetchProfiles to find available
      instance types.
    modelServerInfo: Required. The model server configuration to get
      benchmarking data for. Use GkeInferenceQuickstart.FetchProfiles to find
      valid configurations.
    pricingModel: Optional. The pricing model to use for the benchmarking
      data. Defaults to `spot`.
    servingStack: Optional. The serving stack to filter benchmarking data by,
      e.g. `llm-d/0.3`. If not provided, benchmarking data for all serving
      stacks that support the given model and model server will be returned.
    useCase: Optional. The use case to filter benchmarking data by. If not
      provided, all benchmarking data for the given profile's
      `model_server_info` will be returned.
  """

  instanceType = _messages.StringField(1)
  modelServerInfo = _messages.MessageField('ModelServerInfo', 2)
  pricingModel = _messages.StringField(3)
  servingStack = _messages.MessageField('ServingStack', 4)
  useCase = _messages.StringField(5)
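

# Illustrative sketch, not generated code: a benchmarking-data request. The
# model and server names below are hypothetical placeholders; use
# GkeInferenceQuickstart.FetchProfiles to discover valid combinations.
def _example_fetch_benchmarking_data_request():
  """Returns a request filtered to one instance type at spot pricing."""
  return FetchBenchmarkingDataRequest(
      modelServerInfo=ModelServerInfo(
          model='owner/model_name',  # hypothetical Huggingface Hub id
          modelServer='vllm',
      ),
      instanceType='a2-highgpu-1g',  # example format from the docstring
      pricingModel='spot',  # the documented default
  )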


class FetchBenchmarkingDataResponse(_messages.Message):
  r"""Response message for GkeInferenceQuickstart.FetchBenchmarkingData.

  Fields:
    profile: Output only. List of profiles containing their respective
      benchmarking data.
  """

  profile = _messages.MessageField('Profile', 1, repeated=True)


class FetchModelServerVersionsResponse(_messages.Message):
  r"""Response message for GkeInferenceQuickstart.FetchModelServerVersions.

  Fields:
    modelServerVersions: Output only. A list of available model server
      versions.
    nextPageToken: Output only. A token which may be sent as page_token in a
      subsequent `FetchModelServerVersionsRequest` call to retrieve the next
      page of results. If this field is omitted or empty, then there are no
      more results to return.
  """

  modelServerVersions = _messages.StringField(1, repeated=True)
  nextPageToken = _messages.StringField(2)


class FetchModelServersResponse(_messages.Message):
  r"""Response message for GkeInferenceQuickstart.FetchModelServers.

  Fields:
    modelServers: Output only. List of available model servers. Open-source
      model servers use simplified, lowercase names (e.g., `vllm`).
    nextPageToken: Output only. A token which may be sent as page_token in a
      subsequent `FetchModelServersRequest` call to retrieve the next page of
      results. If this field is omitted or empty, then there are no more
      results to return.
  """

  modelServers = _messages.StringField(1, repeated=True)
  nextPageToken = _messages.StringField(2)


class FetchModelsResponse(_messages.Message):
  r"""Response message for GkeInferenceQuickstart.FetchModels.

  Fields:
    models: Output only. List of available models. Open-source models follow
      the Huggingface Hub `owner/model_name` format.
    nextPageToken: Output only. A token which may be sent as page_token in a
      subsequent `FetchModelsRequest` call to retrieve the next page of
      results. If this field is omitted or empty, then there are no more
      results to return.
  """

  models = _messages.StringField(1, repeated=True)
  nextPageToken = _messages.StringField(2)


class FetchProfilesRequest(_messages.Message):
  r"""Request message for GkeInferenceQuickstart.FetchProfiles.

  Fields:
    model: Optional. The model to filter profiles by. Open-source models
      follow the Huggingface Hub `owner/model_name` format. If not provided,
      all models are returned. Use GkeInferenceQuickstart.FetchModels to find
      available models.
    modelServer: Optional. The model server to filter profiles by. If not
      provided, all model servers are returned. Use
      GkeInferenceQuickstart.FetchModelServers to find available model servers
      for a given model.
    modelServerVersion: Optional. The model server version to filter profiles
      by. If not provided, all model server versions are returned. Use
      GkeInferenceQuickstart.FetchModelServerVersions to find available
      versions for a given model and server.
    pageSize: Optional. The target number of results to return in a single
      response. If not specified, a default value will be chosen by the
      service. Note that the response may include a partial list and a caller
      should only rely on the response's next_page_token to determine if there
      are more instances left to be queried.
    pageToken: Optional. The value of next_page_token received from a previous
      `FetchProfilesRequest` call. Provide this to retrieve the subsequent
      page in a multi-page list of results. When paginating, all other
      parameters provided to `FetchProfilesRequest` must match the call that
      provided the page token.
    performanceRequirements: Optional. The performance requirements to filter
      profiles. Profiles that do not meet these requirements are filtered out.
      If not provided, all profiles are returned.
    servingStack: Optional. The serving stack to filter profiles by. If not
      provided, profiles for all serving stacks that support the given model
      and model server will be returned.
    workloadSpec: Optional. The workload specification to filter profiles by.
      If not provided, all use cases are returned.
  """

  model = _messages.StringField(1)
  modelServer = _messages.StringField(2)
  modelServerVersion = _messages.StringField(3)
  pageSize = _messages.IntegerField(4, variant=_messages.Variant.INT32)
  pageToken = _messages.StringField(5)
  performanceRequirements = _messages.MessageField('PerformanceRequirements', 6)
  servingStack = _messages.MessageField('ServingStack', 7)
  workloadSpec = _messages.MessageField('WorkloadSpec', 8)
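

# Illustrative sketch, not generated code: the pagination contract described
# in the docstring above. `fetch_profiles` stands in for whatever client
# callable issues the RPC; it is an assumption, not part of this module.
def _example_paginate_profiles(fetch_profiles):
  """Yields profiles across pages by echoing back next_page_token."""
  request = FetchProfilesRequest(model='owner/model_name')  # hypothetical
  while True:
    response = fetch_profiles(request)
    for profile in response.profile:
      yield profile
    if not response.nextPageToken:
      break
    # All other request parameters must stay unchanged while paginating.
    request.pageToken = response.nextPageToken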


class FetchProfilesResponse(_messages.Message):
  r"""Response message for GkeInferenceQuickstart.FetchProfiles.

  Fields:
    comments: Output only. Additional comments related to the response.
    nextPageToken: Output only. A token which may be sent as page_token in a
      subsequent `FetchProfilesRequest` call to retrieve the next page of
      results. If this field is omitted or empty, then there are no more
      results to return.
    performanceRange: Output only. The combined range of performance values
      observed across all profiles in this response.
    profile: Output only. List of profiles that match the given model server
      info and performance requirements (if provided).
  """

  comments = _messages.StringField(1)
  nextPageToken = _messages.StringField(2)
  performanceRange = _messages.MessageField('PerformanceRange', 3)
  profile = _messages.MessageField('Profile', 4, repeated=True)


class FetchServingStackVersionsResponse(_messages.Message):
  r"""Response message for GkeInferenceQuickstart.FetchServingStackVersions.

  Fields:
    nextPageToken: Output only. A token which may be sent as page_token in a
      subsequent `FetchServingStackVersionsRequest` call to retrieve the next
      page of results. If this field is omitted or empty, then there are no
      more results to return.
    servingStackVersions: Output only. A list of available serving stack
      versions.
  """

  nextPageToken = _messages.StringField(1)
  servingStackVersions = _messages.StringField(2, repeated=True)


class FetchServingStacksResponse(_messages.Message):
  r"""Response message for GkeInferenceQuickstart.FetchServingStacks.

  Fields:
    nextPageToken: Output only. A token which may be sent as page_token in a
      subsequent `FetchServingStacksRequest` call to retrieve the next page
      of results. If this field is omitted or empty, then there are no more
      results to return.
    servingStacks: Output only. List of available serving stacks.
  """

  nextPageToken = _messages.StringField(1)
  servingStacks = _messages.MessageField('ServingStack', 2, repeated=True)


class FetchUseCasesRequest(_messages.Message):
  r"""Request message for GkeInferenceQuickstart.FetchUseCases."""


class FetchUseCasesResponse(_messages.Message):
  r"""Response message for GkeInferenceQuickstart.FetchUseCases.

  Fields:
    workloadSpecs: Output only. The workload specifications supported by the
      service.
  """

  workloadSpecs = _messages.MessageField('WorkloadSpec', 1, repeated=True)


class GenerateOptimizedManifestRequest(_messages.Message):
  r"""Request message for GkeInferenceQuickstart.GenerateOptimizedManifest.

  Fields:
    acceleratorType: Required. The accelerator type. Use
      GkeInferenceQuickstart.FetchProfiles to find valid accelerators for a
      given `model_server_info`.
    kubernetesNamespace: Optional. The Kubernetes namespace to deploy the
      manifests in.
    modelServerInfo: Required. The model server configuration to generate the
      manifest for. Use GkeInferenceQuickstart.FetchProfiles to find valid
      configurations.
    performanceRequirements: Optional. The performance requirements to use for
      generating Horizontal Pod Autoscaler (HPA) resources. If provided, the
      manifest includes HPA resources to adjust the model server replica count
      to maintain the specified targets (e.g., NTPOT, TTFT) at a P50 latency.
      Cost targets are not currently supported for HPA generation. If the
      specified targets are not achievable, the HPA manifest will not be
      generated.
    servingStack: Optional. The serving stack to use for generating the
      manifest. If not provided, the latest serving stack that supports the
      given model and model server will be used.
    storageConfig: Optional. The storage configuration for the model. If not
      provided, the model is loaded from Huggingface.
    useCase: Optional. The use case of the workload. Can be one of: `advanced
      customer support`, `code completion`, `text summarization`, `chatbot`,
      `text generation`, `deep research`. If not provided, `chatbot` is used.
  """

  acceleratorType = _messages.StringField(1)
  kubernetesNamespace = _messages.StringField(2)
  modelServerInfo = _messages.MessageField('ModelServerInfo', 3)
  performanceRequirements = _messages.MessageField('PerformanceRequirements', 4)
  servingStack = _messages.MessageField('ServingStack', 5)
  storageConfig = _messages.MessageField('StorageConfig', 6)
  useCase = _messages.StringField(7)
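

# Illustrative sketch, not generated code: a manifest-generation request with
# an NTPOT target, which the docstring says yields HPA resources when the
# target is achievable. All concrete values below are hypothetical.
def _example_generate_optimized_manifest_request():
  """Returns a request targeting a P50 NTPOT of 200 ms."""
  return GenerateOptimizedManifestRequest(
      acceleratorType='nvidia-h100-80gb',  # example format from Profile docs
      modelServerInfo=ModelServerInfo(
          model='owner/model_name',  # hypothetical
          modelServer='vllm',
      ),
      performanceRequirements=PerformanceRequirements(
          targetNtpotMilliseconds=200,
      ),
      kubernetesNamespace='inference',  # hypothetical namespace
  )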


class GenerateOptimizedManifestResponse(_messages.Message):
  r"""Response message for GkeInferenceQuickstart.GenerateOptimizedManifest.

  Fields:
    comments: Output only. Comments related to deploying the generated
      manifests.
    kubernetesManifests: Output only. A list of generated Kubernetes
      manifests.
    manifestVersion: Output only. Additional information about the versioned
      dependencies used to generate the manifests. See [Run best practice
      inference with GKE Inference Quickstart
      recipes](https://cloud.google.com/kubernetes-engine/docs/how-to/machine-
      learning/inference/inference-quickstart) for details.
  """

  comments = _messages.StringField(1, repeated=True)
  kubernetesManifests = _messages.MessageField('KubernetesManifest', 2, repeated=True)
  manifestVersion = _messages.StringField(3)
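

# Illustrative sketch, not generated code: joining the generated manifests
# into one YAML document stream suitable for `kubectl apply -f -`. The
# response argument is assumed to be a GenerateOptimizedManifestResponse.
def _example_manifests_to_yaml(response):
  """Concatenates manifest contents with YAML document separators."""
  return '\n---\n'.join(m.content for m in response.kubernetesManifests)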


class GkerecommenderModelServerVersionsFetchRequest(_messages.Message):
  r"""A GkerecommenderModelServerVersionsFetchRequest object.

  Fields:
    model: Required. The model for which to list model server versions. Open-
      source models follow the Huggingface Hub `owner/model_name` format. Use
      GkeInferenceQuickstart.FetchModels to find available models.
    modelServer: Required. The model server for which to list versions. Open-
      source model servers use simplified, lowercase names (e.g., `vllm`). Use
      GkeInferenceQuickstart.FetchModelServers to find available model
      servers.
    pageSize: Optional. The target number of results to return in a single
      response. If not specified, a default value will be chosen by the
      service. Note that the response may include a partial list and a caller
      should only rely on the response's next_page_token to determine if there
      are more instances left to be queried.
    pageToken: Optional. The value of next_page_token received from a previous
      `FetchModelServerVersionsRequest` call. Provide this to retrieve the
      subsequent page in a multi-page list of results. When paginating, all
      other parameters provided to `FetchModelServerVersionsRequest` must
      match the call that provided the page token.
  """

  model = _messages.StringField(1)
  modelServer = _messages.StringField(2)
  pageSize = _messages.IntegerField(3, variant=_messages.Variant.INT32)
  pageToken = _messages.StringField(4)


class GkerecommenderModelServersFetchRequest(_messages.Message):
  r"""A GkerecommenderModelServersFetchRequest object.

  Fields:
    model: Required. The model for which to list model servers. Open-source
      models follow the Huggingface Hub `owner/model_name` format. Use
      GkeInferenceQuickstart.FetchModels to find available models.
    pageSize: Optional. The target number of results to return in a single
      response. If not specified, a default value will be chosen by the
      service. Note that the response may include a partial list and a caller
      should only rely on the response's next_page_token to determine if there
      are more instances left to be queried.
    pageToken: Optional. The value of next_page_token received from a previous
      `FetchModelServersRequest` call. Provide this to retrieve the subsequent
      page in a multi-page list of results. When paginating, all other
      parameters provided to `FetchModelServersRequest` must match the call
      that provided the page token.
  """

  model = _messages.StringField(1)
  pageSize = _messages.IntegerField(2, variant=_messages.Variant.INT32)
  pageToken = _messages.StringField(3)


class GkerecommenderModelsFetchRequest(_messages.Message):
  r"""A GkerecommenderModelsFetchRequest object.

  Fields:
    pageSize: Optional. The target number of results to return in a single
      response. If not specified, a default value will be chosen by the
      service. Note that the response may include a partial list and a caller
      should only rely on the response's next_page_token to determine if there
      are more instances left to be queried.
    pageToken: Optional. The value of next_page_token received from a previous
      `FetchModelsRequest` call. Provide this to retrieve the subsequent page
      in a multi-page list of results. When paginating, all other parameters
      provided to `FetchModelsRequest` must match the call that provided the
      page token.
  """

  pageSize = _messages.IntegerField(1, variant=_messages.Variant.INT32)
  pageToken = _messages.StringField(2)


class GkerecommenderServingStackVersionsFetchRequest(_messages.Message):
  r"""A GkerecommenderServingStackVersionsFetchRequest object.

  Fields:
    model: Optional. The model to filter serving stack versions by.
    modelServer: Optional. The model server to filter serving stack versions
      by.
    pageSize: Optional. The target number of results to return in a single
      response. If not specified, a default value will be chosen by the
      service. Note that the response may include a partial list and a caller
      should only rely on the response's next_page_token to determine if there
      are more instances left to be queried.
    pageToken: Optional. The value of next_page_token received from a previous
      `FetchServingStackVersionsRequest` call. Provide this to retrieve the
      subsequent page in a multi-page list of results. When paginating, all
      other parameters provided to `FetchServingStackVersionsRequest` must
      match the call that provided the page token.
    servingStack: Required. The serving stack to list versions for.
  """

  model = _messages.StringField(1)
  modelServer = _messages.StringField(2)
  pageSize = _messages.IntegerField(3, variant=_messages.Variant.INT32)
  pageToken = _messages.StringField(4)
  servingStack = _messages.StringField(5)


class GkerecommenderServingStacksFetchRequest(_messages.Message):
  r"""A GkerecommenderServingStacksFetchRequest object.

  Fields:
    model: Optional. The model for which to list serving stacks. Open-source
      models follow the Huggingface Hub `owner/model_name` format. Use
      GkeInferenceQuickstart.FetchModels to find available models.
    modelServer: Optional. The model server for which to list serving stacks.
      Open-source model servers use simplified, lowercase names (e.g.,
      `vllm`). Use GkeInferenceQuickstart.FetchModelServers to find available
      model servers.
    pageSize: Optional. The target number of results to return in a single
      response. If not specified, a default value will be chosen by the
      service. Note that the response may include a partial list and a caller
      should only rely on the response's next_page_token to determine if there
      are more instances left to be queried.
    pageToken: Optional. The value of next_page_token received from a previous
      `FetchServingStacksRequest` call. Provide this to retrieve the
      subsequent page in a multi-page list of results. When paginating, all
      other parameters provided to `FetchServingStacksRequest` must match the
      call that provided the page token.
  """

  model = _messages.StringField(1)
  modelServer = _messages.StringField(2)
  pageSize = _messages.IntegerField(3, variant=_messages.Variant.INT32)
  pageToken = _messages.StringField(4)


class KubernetesManifest(_messages.Message):
  r"""A Kubernetes manifest.

  Fields:
    apiVersion: Output only. Kubernetes API version.
    content: Output only. YAML content.
    kind: Output only. Kubernetes resource kind.
  """

  apiVersion = _messages.StringField(1)
  content = _messages.StringField(2)
  kind = _messages.StringField(3)


class MillisecondRange(_messages.Message):
  r"""Represents a range of latency values in milliseconds.

  Fields:
    max: Output only. The maximum value of the range.
    min: Output only. The minimum value of the range.
  """

  max = _messages.IntegerField(1, variant=_messages.Variant.INT32)
  min = _messages.IntegerField(2, variant=_messages.Variant.INT32)


class ModelInfo(_messages.Message):
  r"""Model information for a model deployment.

  Fields:
    attentionHeadsCount: Output only. The number of attention heads in the
      model.
    headDimensions: Output only. The number of dimensions in the model's
      hidden layers.
    hiddenLayersCount: Output only. The number of hidden layers in the model.
    kvCacheSizePerToken: Output only. The size of the key-value (KV) cache per
      token.
    kvHeadsCount: Output only. The number of key-value (KV) heads in the
      model.
    maxContextLength: Output only. The maximum context length of the model.
    modelSizeGb: Output only. The size of the model in gigabytes.
    parametersBillionsCount: Output only. The number of parameters in
      billions.
  """

  attentionHeadsCount = _messages.IntegerField(1, variant=_messages.Variant.INT32)
  headDimensions = _messages.IntegerField(2, variant=_messages.Variant.INT32)
  hiddenLayersCount = _messages.IntegerField(3, variant=_messages.Variant.INT32)
  kvCacheSizePerToken = _messages.IntegerField(4, variant=_messages.Variant.INT32)
  kvHeadsCount = _messages.IntegerField(5, variant=_messages.Variant.INT32)
  maxContextLength = _messages.IntegerField(6, variant=_messages.Variant.INT32)
  modelSizeGb = _messages.FloatField(7, variant=_messages.Variant.FLOAT)
  parametersBillionsCount = _messages.FloatField(8, variant=_messages.Variant.FLOAT)


class ModelServerInfo(_messages.Message):
  r"""Model server information gives. Valid model server info combinations can
  be found using GkeInferenceQuickstart.FetchProfiles.

  Fields:
    model: Required. The model. Open-source models follow the Huggingface Hub
      `owner/model_name` format. Use GkeInferenceQuickstart.FetchModels to
      find available models.
    modelServer: Required. The model server. Open-source model servers use
      simplified, lowercase names (e.g., `vllm`). Use
      GkeInferenceQuickstart.FetchModelServers to find available servers.
    modelServerVersion: Optional. The model server version. Use
      GkeInferenceQuickstart.FetchModelServerVersions to find available
      versions. If not provided, the latest available version is used.
  """

  model = _messages.StringField(1)
  modelServer = _messages.StringField(2)
  modelServerVersion = _messages.StringField(3)


class PerformanceRange(_messages.Message):
  r"""Performance range for a model deployment.

  Fields:
    itlRange: Output only. The range of inter-token latency (ITL) in
      milliseconds. ITL is the latency between consecutive tokens being
      generated.
    ntpotRange: Output only. The range of NTPOT (Normalized Time Per Output
      Token) in milliseconds. NTPOT is the request latency normalized by the
      number of output tokens, measured as request_latency /
      total_output_tokens.
    throughputInputRange: Output only. The range of input tokens per second.
      This is measured as the number of input tokens processed by the server
      divided by the elapsed time in seconds.
    throughputOutputRange: Output only. The range of throughput in output
      tokens per second. This is measured as
      total_output_tokens_generated_by_server / elapsed_time_in_seconds.
    throughputRange: Output only. The range of total tokens per second. This
      is measured as the sum of input and output tokens processed by the
      server divided by the elapsed time in seconds.
    ttftRange: Output only. The range of TTFT (Time To First Token) in
      milliseconds. TTFT is the time it takes to generate the first token for
      a request.
  """

  itlRange = _messages.MessageField('MillisecondRange', 1)
  ntpotRange = _messages.MessageField('MillisecondRange', 2)
  throughputInputRange = _messages.MessageField('TokensPerSecondRange', 3)
  throughputOutputRange = _messages.MessageField('TokensPerSecondRange', 4)
  throughputRange = _messages.MessageField('TokensPerSecondRange', 5)
  ttftRange = _messages.MessageField('MillisecondRange', 6)


class PerformanceRequirements(_messages.Message):
  r"""Performance requirements for a profile and or model deployment.

  Fields:
    targetCost: Optional. The target cost for running a profile's model
      server. If not provided, this requirement will not be enforced.
    targetItlMilliseconds: Optional. The target inter-token latency (ITL) in
      milliseconds. ITL is the latency between consecutive tokens being
      generated. If not provided, this target will not be enforced.
    targetNtpotMilliseconds: Optional. The target Normalized Time Per Output
      Token (NTPOT) in milliseconds. NTPOT is calculated as `request_latency /
      total_output_tokens`. If not provided, this target will not be enforced.
    targetTtftMilliseconds: Optional. The target Time To First Token (TTFT) in
      milliseconds. TTFT is the time it takes to generate the first token for
      a request. If not provided, this target will not be enforced.
  """

  targetCost = _messages.MessageField('Cost', 1)
  targetItlMilliseconds = _messages.IntegerField(2, variant=_messages.Variant.INT32)
  targetNtpotMilliseconds = _messages.IntegerField(3, variant=_messages.Variant.INT32)
  targetTtftMilliseconds = _messages.IntegerField(4, variant=_messages.Variant.INT32)
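

# Illustrative sketch, not generated code: requirements with latency targets
# only. The millisecond values are hypothetical; omitted targets are simply
# not enforced, per the field documentation.
def _example_performance_requirements():
  """Returns requirements targeting 500 ms TTFT and 20 ms ITL."""
  return PerformanceRequirements(
      targetTtftMilliseconds=500,
      targetItlMilliseconds=20,
  )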


class PerformanceStats(_messages.Message):
  r"""Performance statistics for a model deployment.

  Fields:
    cost: Output only. The cost of running the model deployment.
    inputTokensPerSecond: Output only. The input tokens per second. This is
      the throughput measured as the number of input tokens processed by the
      server divided by the elapsed time in seconds.
    itlMilliseconds: Output only. The inter-token latency (ITL) in
      milliseconds. This is the latency between consecutive tokens being
      generated.
    ntpotMilliseconds: Output only. The Normalized Time Per Output Token
      (NTPOT) in milliseconds. This is the request latency normalized by the
      number of output tokens, measured as request_latency /
      total_output_tokens.
    outputTokensPerSecond: Output only. The number of output tokens per
      second. This is the throughput measured as
      total_output_tokens_generated_by_server / elapsed_time_in_seconds.
    queriesPerSecond: Output only. The number of queries per second. Note:
      This metric can vary widely based on context length and may not be a
      reliable measure of LLM throughput.
    totalTokensPerSecond: Output only. The total tokens per second. This is
      the throughput measured as the sum of input and output tokens processed
      by the server divided by the elapsed time in seconds.
    ttftMilliseconds: Output only. The Time To First Token (TTFT) in
      milliseconds. This is the time it takes to generate the first token for
      a request.
  """

  cost = _messages.MessageField('Cost', 1, repeated=True)
  inputTokensPerSecond = _messages.IntegerField(2, variant=_messages.Variant.INT32)
  itlMilliseconds = _messages.IntegerField(3, variant=_messages.Variant.INT32)
  ntpotMilliseconds = _messages.IntegerField(4, variant=_messages.Variant.INT32)
  outputTokensPerSecond = _messages.IntegerField(5, variant=_messages.Variant.INT32)
  queriesPerSecond = _messages.FloatField(6, variant=_messages.Variant.FLOAT)
  totalTokensPerSecond = _messages.IntegerField(7, variant=_messages.Variant.INT32)
  ttftMilliseconds = _messages.IntegerField(8, variant=_messages.Variant.INT32)
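

# Illustrative sketch, not generated code: the NTPOT formula quoted in the
# docstring above, request_latency / total_output_tokens, as a helper.
def _example_ntpot_milliseconds(request_latency_ms, total_output_tokens):
  """Returns NTPOT in milliseconds per the documented formula."""
  return request_latency_ms / total_output_tokens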


class Profile(_messages.Message):
  r"""A profile containing information about a model deployment.

  Fields:
    acceleratorType: Output only. The accelerator type. Expected format:
      `nvidia-h100-80gb`.
    instanceType: Output only. The instance type. Expected format:
      `a2-highgpu-1g`.
    modelInfo: Output only. The model information of the model in this
      profile.
    modelServerInfo: Output only. The model server configuration. Use
      GkeInferenceQuickstart.FetchProfiles to find valid configurations.
    performanceStats: Output only. The performance statistics for this
      profile.
    resourcesUsed: Output only. The resources used by the model deployment.
    servingStack: Output only. The serving stack used for this profile.
    tpuTopology: Output only. The TPU topology (if applicable).
    workloadSpec: Output only. The workload specification.
  """

  acceleratorType = _messages.StringField(1)
  instanceType = _messages.StringField(2)
  modelInfo = _messages.MessageField('ModelInfo', 3)
  modelServerInfo = _messages.MessageField('ModelServerInfo', 4)
  performanceStats = _messages.MessageField('PerformanceStats', 5, repeated=True)
  resourcesUsed = _messages.MessageField('ResourcesUsed', 6)
  servingStack = _messages.MessageField('ServingStack', 7)
  tpuTopology = _messages.StringField(8)
  workloadSpec = _messages.MessageField('WorkloadSpec', 9)


class ResourcesUsed(_messages.Message):
  r"""Resources used by a model deployment.

  Fields:
    acceleratorCount: Output only. The number of accelerators (e.g., GPUs or
      TPUs) used by the model deployment on the Kubernetes node.
  """

  acceleratorCount = _messages.IntegerField(1, variant=_messages.Variant.INT32)


class ServingStack(_messages.Message):
  r"""Serving stack information.

  Fields:
    name: Required. The name of the serving stack.
    version: Optional. The version of the serving stack.
  """

  name = _messages.StringField(1)
  version = _messages.StringField(2)


class StandardQueryParameters(_messages.Message):
  r"""Query parameters accepted by all methods.

  Enums:
    FXgafvValueValuesEnum: V1 error format.
    AltValueValuesEnum: Data format for response.

  Fields:
    f__xgafv: V1 error format.
    access_token: OAuth access token.
    alt: Data format for response.
    callback: JSONP
    fields: Selector specifying which fields to include in a partial response.
    key: API key. Your API key identifies your project and provides you with
      API access, quota, and reports. Required unless you provide an OAuth 2.0
      token.
    oauth_token: OAuth 2.0 token for the current user.
    prettyPrint: Returns response with indentations and line breaks.
    quotaUser: Available to use for quota purposes for server-side
      applications. Can be any arbitrary string assigned to a user, but should
      not exceed 40 characters.
    trace: A tracing token of the form "token:<tokenid>" to include in API
      requests.
    uploadType: Legacy upload protocol for media (e.g. "media", "multipart").
    upload_protocol: Upload protocol for media (e.g. "raw", "multipart").
  """

  class AltValueValuesEnum(_messages.Enum):
    r"""Data format for response.

    Values:
      json: Responses with Content-Type of application/json
      media: Media download with context-dependent Content-Type
      proto: Responses with Content-Type of application/x-protobuf
    """
    json = 0
    media = 1
    proto = 2

  class FXgafvValueValuesEnum(_messages.Enum):
    r"""V1 error format.

    Values:
      _1: v1 error format
      _2: v2 error format
    """
    _1 = 0
    _2 = 1

  f__xgafv = _messages.EnumField('FXgafvValueValuesEnum', 1)
  access_token = _messages.StringField(2)
  alt = _messages.EnumField('AltValueValuesEnum', 3, default='json')
  callback = _messages.StringField(4)
  fields = _messages.StringField(5)
  key = _messages.StringField(6)
  oauth_token = _messages.StringField(7)
  prettyPrint = _messages.BooleanField(8, default=True)
  quotaUser = _messages.StringField(9)
  trace = _messages.StringField(10)
  uploadType = _messages.StringField(11)
  upload_protocol = _messages.StringField(12)


class StorageConfig(_messages.Message):
  r"""Storage configuration for a model deployment.

  Fields:
    modelBucketUri: Optional. The Google Cloud Storage bucket URI to load the
      model from. This URI must point to the directory containing the model's
      config file (`config.json`) and model weights. A tuned GCSFuse setup can
      improve LLM Pod startup time by more than 7x. Expected format:
      `gs://<bucket>/<path>`.
    xlaCacheBucketUri: Optional. The URI for the GCS bucket containing the XLA
      compilation cache. If using TPUs, the XLA cache will be written to the
      same path as `model_bucket_uri`. This can speed up vLLM model
      preparation for repeated deployments.
  """

  modelBucketUri = _messages.StringField(1)
  xlaCacheBucketUri = _messages.StringField(2)
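

# Illustrative sketch, not generated code: loading model weights from Cloud
# Storage. The bucket and path are hypothetical placeholders; the URI must
# point at the directory holding `config.json` and the weights.
def _example_storage_config():
  """Returns a StorageConfig with a hypothetical bucket URI."""
  return StorageConfig(modelBucketUri='gs://my-bucket/models/my-model/')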


class TokensPerSecondRange(_messages.Message):
  r"""Represents a range of throughput values in tokens per second.

  Fields:
    max: Output only. The maximum value of the range.
    min: Output only. The minimum value of the range.
  """

  max = _messages.IntegerField(1, variant=_messages.Variant.INT32)
  min = _messages.IntegerField(2, variant=_messages.Variant.INT32)


class WorkloadSpec(_messages.Message):
  r"""Workload specification for a workload.

  Fields:
    averageInputLength: Optional. The average input length of the workload.
      Takes effect only when set together with `averageOutputLength`.
    averageOutputLength: Optional. The average output length of the workload.
      Takes effect only when set together with `averageInputLength`.
    useCase: Optional. The use case of the workload. Can be one of: `advanced
      customer support`, `code completion`, `text summarization`, `chatbot`,
      `text generation`, `deep research`.
  """

  averageInputLength = _messages.IntegerField(1, variant=_messages.Variant.INT32)
  averageOutputLength = _messages.IntegerField(2, variant=_messages.Variant.INT32)
  useCase = _messages.StringField(3)
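

# Illustrative sketch, not generated code: a chatbot workload. The token
# lengths are hypothetical and, per the field documentation, only take
# effect when both are set.
def _example_workload_spec():
  """Returns a WorkloadSpec for the `chatbot` use case."""
  return WorkloadSpec(
      useCase='chatbot',
      averageInputLength=512,
      averageOutputLength=256,
  )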


encoding.AddCustomJsonFieldMapping(
    StandardQueryParameters, 'f__xgafv', '$.xgafv')
encoding.AddCustomJsonEnumMapping(
    StandardQueryParameters.FXgafvValueValuesEnum, '_1', '1')
encoding.AddCustomJsonEnumMapping(
    StandardQueryParameters.FXgafvValueValuesEnum, '_2', '2')
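

# Illustrative sketch, not generated code: per the mappings above, apitools'
# JSON encoding renames `f__xgafv` to `$.xgafv` and strips the underscore
# prefix from its enum values.
def _example_xgafv_json():
  """Serializes f__xgafv=_2; the JSON key is "$.xgafv" with value "2"."""
  params = StandardQueryParameters(
      f__xgafv=StandardQueryParameters.FXgafvValueValuesEnum._2)
  return encoding.MessageToJson(params)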
