instill-ai · tonywang10101 · Dec 22, 2023 · Dec 20, 2023 · Dec 21, 2023 · Dec 21, 2023
@@ -26,10 +26,23 @@ message ExtraParamObject {
   string param_value = 2;
 }
 
-// Conversation based prompt for text generation model
-message ConversationObject {
-  // Role name of the conversation
-  string role = 1;
-  // Content of the conversation
+// PromptImage is one image passed to a text generation model
+message PromptImage {
+  // The image is given in at most one form: a URL or a base64 encoded string
+  oneof type {
+    // URL of the image
+    string prompt_image_url = 1;
+    // The image as a base64 encoded string
+    string prompt_image_base64 = 2;
+  }
+}
+
+// Content is one entry of the chat history passed to a text generation model
+message Content {
+  // Type of this entry; presumably distinguishes text from image - TODO confirm
+  string type = 1;
+  // Text of the message
   string content = 2;
+  // Image of the entry, given as a URL or base64 string (see PromptImage)
+  PromptImage prompt_image = 3;
 }
@@ -4,7 +4,7 @@ package model.model.v1alpha;
 
 // Google api
 import "google/api/field_behavior.proto";
-import "model/model/v1alpha/common.proto";
+import "google/protobuf/struct.proto";
 
 // ImageToImageInput represents the input of image to image task
 message ImageToImageInput {
@@ -26,7 +26,7 @@ message ImageToImageInput {
   // The number of generated samples, default is 1
   optional int32 samples = 7 [(google.api.field_behavior) = OPTIONAL];
   // The extra parameters
-  repeated ExtraParamObject extra_params = 8 [(google.api.field_behavior) = OPTIONAL];
+  google.protobuf.Struct extra_params = 9 [(google.api.field_behavior) = OPTIONAL];
 }
 
 // ImageToImageOutput represents the output of image to image task

@@ -4,22 +4,29 @@ package model.model.v1alpha;
 
 // Google api
 import "google/api/field_behavior.proto";
+import "google/protobuf/struct.proto";
 import "model/model/v1alpha/common.proto";
 
 // TextGenerationInput represents the input of text generation task
 message TextGenerationInput {
   // The prompt text
   string prompt = 1 [(google.api.field_behavior) = REQUIRED];
+  // The prompt images. NOTE(review): tags 2-6 held other fields before this change; renumbering is wire-incompatible
+  repeated PromptImage prompt_images = 2 [(google.api.field_behavior) = OPTIONAL];
+  // The chat history, as a list of Content entries
+  repeated Content chat_history = 3 [(google.api.field_behavior) = OPTIONAL];
+  // The system message, if any
+  optional string system_message = 4 [(google.api.field_behavior) = OPTIONAL];
   // The maximum number of tokens for model to generate
-  optional int32 max_new_tokens = 2 [(google.api.field_behavior) = OPTIONAL];
+  optional int32 max_new_tokens = 5 [(google.api.field_behavior) = OPTIONAL];
   // The temperature for sampling
-  optional float temperature = 3 [(google.api.field_behavior) = OPTIONAL];
+  optional float temperature = 6 [(google.api.field_behavior) = OPTIONAL];
   // Top k for sampling
-  optional int32 top_k = 4 [(google.api.field_behavior) = OPTIONAL];
+  optional int32 top_k = 7 [(google.api.field_behavior) = OPTIONAL];
   // The seed
-  optional int32 seed = 5 [(google.api.field_behavior) = OPTIONAL];
+  optional int32 seed = 8 [(google.api.field_behavior) = OPTIONAL];
   // The extra parameters
-  repeated ExtraParamObject extra_params = 6 [(google.api.field_behavior) = OPTIONAL];
+  google.protobuf.Struct extra_params = 9 [(google.api.field_behavior) = OPTIONAL];
 }
 
 // TextGenerationOutput represents the output of text generation task

@@ -4,22 +4,29 @@ package model.model.v1alpha;
 
 // Google api
 import "google/api/field_behavior.proto";
+import "google/protobuf/struct.proto";
 import "model/model/v1alpha/common.proto";
 
 // TextGenerationChatInput represents the input of text generation chat task
 message TextGenerationChatInput {
   // The prompt text
-  repeated ConversationObject conversation = 1 [(google.api.field_behavior) = REQUIRED];
+  string prompt = 1 [(google.api.field_behavior) = REQUIRED];
+  // The prompt images. NOTE(review): tags 1-6 held other fields before this change; renumbering is wire-incompatible
+  repeated PromptImage prompt_images = 2 [(google.api.field_behavior) = OPTIONAL];
+  // The chat history, as a list of Content entries
+  repeated Content chat_history = 3 [(google.api.field_behavior) = OPTIONAL];
+  // The system message, if any
+  optional string system_message = 4 [(google.api.field_behavior) = OPTIONAL];
   // The maximum number of tokens for model to generate
-  optional int32 max_new_tokens = 2 [(google.api.field_behavior) = OPTIONAL];
+  optional int32 max_new_tokens = 5 [(google.api.field_behavior) = OPTIONAL];
   // The temperature for sampling
-  optional float temperature = 3 [(google.api.field_behavior) = OPTIONAL];
+  optional float temperature = 6 [(google.api.field_behavior) = OPTIONAL];
   // Top k for sampling
-  optional int32 top_k = 4 [(google.api.field_behavior) = OPTIONAL];
+  optional int32 top_k = 7 [(google.api.field_behavior) = OPTIONAL];
   // The seed
-  optional int32 seed = 5 [(google.api.field_behavior) = OPTIONAL];
+  optional int32 seed = 8 [(google.api.field_behavior) = OPTIONAL];
   // The extra parameters
-  repeated ExtraParamObject extra_params = 6 [(google.api.field_behavior) = OPTIONAL];
+  google.protobuf.Struct extra_params = 9 [(google.api.field_behavior) = OPTIONAL];
 }
 
 // TextGenerationChatOutput represents the output of text generation chat task

@@ -4,7 +4,7 @@ package model.model.v1alpha;
 
 // Google api
 import "google/api/field_behavior.proto";
-import "model/model/v1alpha/common.proto";
+import "google/protobuf/struct.proto";
 
 // TextToImageInput represents the input of text to image task
 message TextToImageInput {
@@ -26,7 +26,7 @@ message TextToImageInput {
   // The number of generated samples, default is 1
   optional int32 samples = 7 [(google.api.field_behavior) = OPTIONAL];
   // The extra parameters
-  repeated ExtraParamObject extra_params = 8 [(google.api.field_behavior) = OPTIONAL];
+  google.protobuf.Struct extra_params = 9 [(google.api.field_behavior) = OPTIONAL];
 }
 
 // TextToImageOutput represents the output of text to image task

@@ -4,29 +4,29 @@ package model.model.v1alpha;
 
 // Google api
 import "google/api/field_behavior.proto";
+import "google/protobuf/struct.proto";
 import "model/model/v1alpha/common.proto";
 
 // VisualQuestionAnsweringInput represents the input of visual question answering task
 message VisualQuestionAnsweringInput {
   // The prompt text
   string prompt = 1 [(google.api.field_behavior) = REQUIRED];
-  // The Prompt Image, only for multimodal input
-  oneof type {
-    // Image type URL
-    string prompt_image_url = 2;
-    // Image type base64
-    string prompt_image_base64 = 3;
-  }
+  // The prompt images. NOTE(review): tags 2-8 held other fields before this change; renumbering is wire-incompatible
+  repeated PromptImage prompt_images = 2 [(google.api.field_behavior) = OPTIONAL];
+  // The chat history, as a list of Content entries
+  repeated Content chat_history = 3 [(google.api.field_behavior) = OPTIONAL];
+  // The system message, if any
+  optional string system_message = 4 [(google.api.field_behavior) = OPTIONAL];
   // The maximum number of tokens for model to generate
-  optional int32 max_new_tokens = 4 [(google.api.field_behavior) = OPTIONAL];
+  optional int32 max_new_tokens = 5 [(google.api.field_behavior) = OPTIONAL];
   // The temperature for sampling
-  optional float temperature = 5 [(google.api.field_behavior) = OPTIONAL];
+  optional float temperature = 6 [(google.api.field_behavior) = OPTIONAL];
   // Top k for sampling
-  optional int32 top_k = 6 [(google.api.field_behavior) = OPTIONAL];
+  optional int32 top_k = 7 [(google.api.field_behavior) = OPTIONAL];
   // The seed
-  optional int32 seed = 7 [(google.api.field_behavior) = OPTIONAL];
+  optional int32 seed = 8 [(google.api.field_behavior) = OPTIONAL];
   // The extra parameters
-  repeated ExtraParamObject extra_params = 8 [(google.api.field_behavior) = OPTIONAL];
+  google.protobuf.Struct extra_params = 9 [(google.api.field_behavior) = OPTIONAL];
 }
 
 // VisualQuestionAnsweringOutput represents the output of visual question answering task

@@ -7134,16 +7134,19 @@ definitions:
         title: Classification score
         readOnly: true
     title: ClassificationOutput represents the output of classification task
-  v1alphaConversationObject:
+  v1alphaContent:
     type: object
     properties:
-      role:
+      type:
         type: string
-        title: Role name of the conversation
+        title: Type of Content
       content:
         type: string
-        title: Content of the conversation
-    title: Conversation based prompt for text generation model
+        title: Content of Text Message
+      prompt_image:
+        $ref: '#/definitions/v1alphaPromptImage'
+        title: Content of Image
+    title: Content used for chat history in text generation model
   v1alphaCreateUserModelBinaryFileUploadResponse:
     type: object
     properties:
@@ -7236,18 +7239,6 @@ definitions:
         title: A list of detection objects
         readOnly: true
     title: DetectionOutput represents the output of detection task
-  v1alphaExtraParamObject:
-    type: object
-    properties:
-      param_name:
-        type: string
-        title: Name of the hyperparameter
-      param_value:
-        type: string
-        title: Value of the hyperparameter
-    title: |-
-      Additional hyperparameters for model inferences
-      or other configuration not listsed in protobuf
   v1alphaGetModelDefinitionResponse:
     type: object
     properties:
@@ -7305,10 +7296,7 @@ definitions:
         format: int32
         title: The number of generated samples, default is 1
       extra_params:
-        type: array
-        items:
-          type: object
-          $ref: '#/definitions/v1alphaExtraParamObject'
+        type: object
         title: The extra parameters
     title: ImageToImageInput represents the input of image to image task
     required:
@@ -7783,6 +7771,16 @@ definitions:
         title: A list of OCR objects
         readOnly: true
     title: OcrOutput represents the output of ocr task
+  v1alphaPromptImage:
+    type: object
+    properties:
+      prompt_image_url:
+        type: string
+        title: Image URL
+      prompt_image_base64:
+        type: string
+        title: Base64 encoded Image
+    title: Prompt Image for text generation model
   v1alphaPublishUserModelResponse:
     type: object
     properties:
@@ -8072,12 +8070,24 @@ definitions:
   v1alphaTextGenerationChatInput:
     type: object
     properties:
-      conversation:
+      prompt:
+        type: string
+        title: The prompt text
+      prompt_images:
         type: array
         items:
           type: object
-          $ref: '#/definitions/v1alphaConversationObject'
-        title: The prompt text
+          $ref: '#/definitions/v1alphaPromptImage'
+        title: The prompt images
+      chat_history:
+        type: array
+        items:
+          type: object
+          $ref: '#/definitions/v1alphaContent'
+        title: The chat history
+      system_message:
+        type: string
+        title: The system message
       max_new_tokens:
         type: integer
         format: int32
@@ -8095,14 +8105,11 @@ definitions:
         format: int32
         title: The seed
       extra_params:
-        type: array
-        items:
-          type: object
-          $ref: '#/definitions/v1alphaExtraParamObject'
+        type: object
         title: The extra parameters
     title: TextGenerationChatInput represents the input of text generation chat task
     required:
-      - conversation
+      - prompt
   v1alphaTextGenerationChatOutput:
     type: object
     properties:
@@ -8117,6 +8124,21 @@ definitions:
       prompt:
         type: string
         title: The prompt text
+      prompt_images:
+        type: array
+        items:
+          type: object
+          $ref: '#/definitions/v1alphaPromptImage'
+        title: The prompt images
+      chat_history:
+        type: array
+        items:
+          type: object
+          $ref: '#/definitions/v1alphaContent'
+        title: The chat history
+      system_message:
+        type: string
+        title: The system message
       max_new_tokens:
         type: integer
         format: int32
@@ -8134,10 +8156,7 @@ definitions:
         format: int32
         title: The seed
       extra_params:
-        type: array
-        items:
-          type: object
-          $ref: '#/definitions/v1alphaExtraParamObject'
+        type: object
         title: The extra parameters
     title: TextGenerationInput represents the input of text generation task
     required:
@@ -8179,10 +8198,7 @@ definitions:
         format: int32
         title: The number of generated samples, default is 1
       extra_params:
-        type: array
-        items:
-          type: object
-          $ref: '#/definitions/v1alphaExtraParamObject'
+        type: object
         title: The extra parameters
     title: TextToImageInput represents the input of text to image task
     required:
@@ -8285,12 +8301,21 @@ definitions:
       prompt:
         type: string
         title: The prompt text
-      prompt_image_url:
-        type: string
-        title: Image type URL
-      prompt_image_base64:
+      prompt_images:
+        type: array
+        items:
+          type: object
+          $ref: '#/definitions/v1alphaPromptImage'
+        title: The prompt images
+      chat_history:
+        type: array
+        items:
+          type: object
+          $ref: '#/definitions/v1alphaContent'
+        title: The chat history
+      system_message:
         type: string
-        title: Image type base64
+        title: The system message
       max_new_tokens:
         type: integer
         format: int32
@@ -8308,10 +8333,7 @@ definitions:
         format: int32
         title: The seed
       extra_params:
-        type: array
-        items:
-          type: object
-          $ref: '#/definitions/v1alphaExtraParamObject'
+        type: object
         title: The extra parameters
     title: VisualQuestionAnsweringInput represents the input of visual question answering task
     required: