docs(ai): add api docs for audio-to-text pipeline (#594)

* Add api docs for speech-to-text * Capitalize title * Update supported file types * Update recommended price per unit * Update docs for audio-to-text * update file types and request limit, sort menu items * docs(ai): apply small audio-to-text improvements This commit applies some small audio-to-text documentation improvements. --------- Co-authored-by: Rick Staa <[email protected]>
livepeer · Jul 17, 2024 · 1fc870e · 1fc870e
1 parent 509f415
commit 1fc870e
Show file tree

Hide file tree

Showing 5 changed files with 242 additions and 8 deletions.
diff --git a/ai/api-reference/ai-openapi-schema.yml b/ai/api-reference/ai-openapi-schema.yml
@@ -155,6 +155,55 @@ paths:
                 $ref: '#/components/schemas/HTTPValidationError'
       security:
       - HTTPBearer: []
+  /audio-to-text:
+    post:
+      summary: Audio To Text
+      operationId: audio_to_text
+      requestBody:
+        content:
+          multipart/form-data:
+            schema:
+              $ref: '#/components/schemas/Body_audio_to_text_audio_to_text_post'
+        required: true
+      responses:
+        '200':
+          description: Successful Response
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/TextResponse'
+        '400':
+          description: Bad Request
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/HTTPError'
+        '401':
+          description: Unauthorized
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/HTTPError'
+        '413':
+          description: Request Entity Too Large
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/HTTPError'
+        '500':
+          description: Internal Server Error
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/HTTPError'
+        '422':
+          description: Validation Error
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/HTTPValidationError'
+      security:
+      - HTTPBearer: []
 components:
   schemas:
     APIError:
@@ -166,6 +215,21 @@ components:
       required:
       - msg
       title: APIError
+    Body_audio_to_text_audio_to_text_post:
+      properties:
+        audio:
+          type: string
+          format: binary
+          title: Audio
+        model_id:
+          type: string
+          title: Model Id
+          default: ''
+      type: object
+      required:
+      - audio
+      - model_id
+      title: Body_audio_to_text_audio_to_text_post
     Body_image_to_image_image_to_image_post:
       properties:
         prompt:
@@ -333,6 +397,21 @@ components:
       - seed
       - nsfw
       title: Media
+    TextResponse:
+      properties:
+        text:
+          type: string
+          title: Text
+        chunks:
+          items:
+            $ref: '#/components/schemas/chunk'
+          type: array
+          title: Chunks
+      type: object
+      required:
+      - text
+      - chunks
+      title: TextResponse
     TextToImageParams:
       properties:
         model_id:
@@ -399,17 +478,29 @@ components:
       title: ValidationError
     VideoResponse:
       properties:
-        frames:
+        images:
           items:
-            items:
-              $ref: '#/components/schemas/Media'
-            type: array
+            $ref: '#/components/schemas/Media'
           type: array
-          title: Frames
+          title: Images
       type: object
       required:
-      - frames
+      - images
       title: VideoResponse
+    chunk:
+      properties:
+        timestamp:
+          items: {}
+          type: array
+          title: Timestamp
+        text:
+          type: string
+          title: Text
+      type: object
+      required:
+      - timestamp
+      - text
+      title: chunk
   securitySchemes:
     HTTPBearer:
       type: http

diff --git a/ai/api-reference/audio-to-text.mdx b/ai/api-reference/audio-to-text.mdx
@@ -0,0 +1,21 @@
+---
+openapi: post /audio-to-text
+---
+
+<Info>
+  The public [Livepeer.cloud](https://www.livepeer.cloud/) Gateway used in this
+  guide is intended for experimentation and is not guaranteed for production
+  use. It is a free, non-token-gated, but rate-limited service designed for
+  testing purposes. For production-ready applications, consider setting up your
+  own Gateway node or partnering with one via the `ai-video` channel on
+  [Discord](https://discord.gg/livepeer).
+</Info>
+
+<Note>
+  Please note that the **optimal** parameters for a given model may vary
+  depending on the specific model and use case. The parameters provided in this
+  guide are not model-specific and should be used as a starting point.
+  Additionally, some models may have parameters such as `guiding_scale` and
+  `num_inference_steps` disabled by default. For more information on
+  model-specific parameters, please refer to the respective model documentation.
+</Note>
diff --git a/ai/orchestrators/models-config.mdx b/ai/orchestrators/models-config.mdx
@@ -31,7 +31,12 @@ currently **recommended** models and their respective prices.
     "pipeline": "upscale",
     "model_id": "stabilityai/stable-diffusion-x4-upscaler",
     "price_per_unit": 4768371,
-  }
+  },
+  {
+    "pipeline": "audio-to-text",
+    "model_id": "openai/whisper-large-v3",
+    "price_per_unit": 12882811,
+  },
   {
     "pipeline": "image-to-video",
     "model_id": "stabilityai/stable-video-diffusion-img2vid-xt-1-1",

diff --git a/ai/pipelines/audio-to-text.mdx b/ai/pipelines/audio-to-text.mdx
@@ -0,0 +1,115 @@
+---
+title: Audio-to-Text
+---
+
+## Overview
+
+The `audio-to-text` pipeline converts audio from media files into text,
+utilizing cutting-edge speech recognition models from HuggingFace's
+[automatic-speech-recognition (ASR) pipeline](https://huggingface.co/models?pipeline_tag=automatic-speech-recognition).
+
+<div align="center">
+
+</div>
+
+## Models
+
+### Warm Models
+
+The current warm model requested for the `audio-to-text` pipeline is:
+
+- [openai/whisper-large-v3](https://huggingface.co/openai/whisper-large-v3):
+  Whisper is a pre-trained model for automatic speech recognition (ASR) and
+  speech translation.
+
+<Tip>
+  For faster responses with a different
+  [audio-to-text](https://huggingface.co/models?pipeline_tag=automatic-speech-recognition)
+  model, ask Orchestrators to load it on their GPU via the `ai-video`
+  channel in the [Discord Server](https://discord.gg/livepeer).
+</Tip>
+
+### On-Demand Models
+
+The following models have been tested and verified for the `audio-to-text`
+pipeline:
+
+<Note>
+  If a specific model you wish to use is not listed, please submit a [feature
+  request](https://github.com/livepeer/ai-worker/issues/new?assignees=&labels=enhancement%2Cmodel&projects=&template=model_request.yml)
+  on GitHub to get the model verified and added to the list.
+</Note>
+
+{/* prettier-ignore */}
+<Accordion title="Tested and Verified Models">
+- [openai/whisper-large-v3](https://huggingface.co/openai/whisper-large-v3): A high-performance
+  ASR model by OpenAI.
+
+</Accordion>
+
+## Basic Usage Instructions
+
+<Tip>
+  For a detailed understanding of the `audio-to-text` endpoint and to experiment
+  with the API, see the [AI Subnet API
+  Reference](/ai/api-reference/audio-to-text).
+</Tip>
+
+To create an audio transcript using the `audio-to-text` pipeline, submit a
+`POST` request to the Gateway's `audio-to-text` API endpoint:
+
+```bash
+curl -X POST "https://<gateway-ip>/audio-to-text" \
+    -F model_id=openai/whisper-large-v3 \
+    -F audio=@<PATH_TO_FILE>
+```
+
+In this command:
+
+- `<gateway-ip>` should be replaced with your AI Gateway's IP address.
+- `model_id` is the speech recognition model used for transcription.
+- `audio` is the path to the audio file to be transcribed.
+
+<Note>
+  - Supported file types: `mp4`, `webm`, `mp3`, `flac`, `wav` and `m4a`
+  - Maximum request size: 50 MB
+</Note>
+
+For additional optional parameters, refer to the
+[AI Subnet API Reference](/ai/api-reference/audio-to-text).
+
+After execution, the Orchestrator processes the request and returns the response
+to the Gateway:
+
+```json
+{
+    "chunks": [
+        {
+            "text": " Explore the power of automatic speech recognition",
+            "timestamp": [
+                0,
+                1.35
+            ]
+        },
+        {
+            "text": " By extracting the text from audio",
+            "timestamp": [
+                1.35,
+                2.07
+            ]
+        }
+    ],
+    "text": " Explore the power of automatic speech recognition By extracting the text from audio"
+}
+```
+
+## API Reference
+
+<Card
+  title="API Reference"
+  icon="rectangle-terminal"
+  href="/ai/api-reference/audio-to-text"
+>
+  Explore the `audio-to-text` endpoint and experiment with the API in the AI
+  Subnet API Reference.
+</Card>
diff --git a/mint.json b/mint.json
@@ -523,6 +523,7 @@
           "iconType": "solid",
           "pages": [
             "ai/pipelines/overview",
+            "ai/pipelines/audio-to-text",
             "ai/pipelines/text-to-image",
             "ai/pipelines/image-to-image",
             "ai/pipelines/image-to-video",
@@ -570,7 +571,8 @@
             "ai/api-reference/text-to-image",
             "ai/api-reference/image-to-image",
             "ai/api-reference/image-to-video",
-            "ai/api-reference/upscale"
+            "ai/api-reference/upscale",
+            "ai/api-reference/audio-to-text"
           ]
         }
       ]