One of the easiest and cheapest ways to host an on-demand API is with Modal.
Once you create a free account and install the Python client, copy or download the base code below. It's on GitHub too: Code
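Getting the client is a pip install, and the CLI walks you through authentication; something like the following should work with a recent Modal release:
$ pip install modal
$ python -m modal setup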
from modal import Image, Stub, method, NetworkFileSystem, asgi_app
from fastapi import Request, FastAPI
import tempfile
import time

MODEL_DIR = "/model"

web_app = FastAPI()


def download_model():
    # Bake the Whisper large-v3 weights into the image at build time
    from huggingface_hub import snapshot_download

    snapshot_download("openai/whisper-large-v3", local_dir=MODEL_DIR)


# CUDA base image with Python 3.9, ffmpeg, and the inference dependencies;
# flash-attn is compiled on an A10G so the build matches the serving GPU
image = (
    Image.from_registry("nvidia/cuda:12.1.0-cudnn8-devel-ubuntu22.04", add_python="3.9")
    .apt_install("git", "ffmpeg")
    .pip_install(
        "transformers",
        "ninja",
        "packaging",
        "wheel",
        "torch",
        "hf-transfer~=0.1",
        "ffmpeg-python",
    )
    .run_commands("python -m pip install flash-attn --no-build-isolation", gpu="A10G")
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
    .run_function(download_model)
)

stub = Stub("transcribe-x", image=image)
stub.net_file_system = NetworkFileSystem.new()


@stub.cls(
    gpu="A10G",
    allow_concurrent_inputs=80,
    container_idle_timeout=40,
    network_file_systems={"/audio_files": stub.net_file_system},
)
class WhisperV3:
    def __enter__(self):
        # Runs once per container start: load the model and build the ASR pipeline
        import torch
        from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

        model = AutoModelForSpeechSeq2Seq.from_pretrained(
            MODEL_DIR,
            torch_dtype=self.torch_dtype,
            use_safetensors=True,
            use_flash_attention_2=True,
        )
        processor = AutoProcessor.from_pretrained(MODEL_DIR)
        model.to(self.device)

        self.pipe = pipeline(
            "automatic-speech-recognition",
            model=model,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            max_new_tokens=128,
            chunk_length_s=30,
            batch_size=24,
            return_timestamps=True,
            torch_dtype=self.torch_dtype,
            model_kwargs={"use_flash_attention_2": True},
            device=0,
        )

    @method()
    def generate(self, audio: bytes):
        # Write the uploaded bytes to a temp file so ffmpeg can decode them
        fp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
        fp.write(audio)
        fp.close()

        start = time.time()
        output = self.pipe(
            fp.name, chunk_length_s=30, batch_size=24, return_timestamps=True
        )
        elapsed = time.time() - start
        return output, elapsed
@web_app.post("/")
async def transcribe(request: Request):
    # Accept a multipart form with an "audio" field and hand the bytes to the GPU class
    form = await request.form()
    audio = await form["audio"].read()
    output, elapsed = WhisperV3().generate.remote(audio)
    return output, elapsed


@stub.function()
@asgi_app()
def entrypoint():
    # Serve the FastAPI app as a Modal web endpoint
    return web_app
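If you want to sanity-check the endpoint before deploying, Modal can also run the app ephemerally with hot-reloading, which gives you a temporary URL (this assumes the file is saved as modal_app.py, as in the deploy step below):
$ modal serve modal_app.py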
After authenticating with the Modal CLI, run this in your terminal:
$ modal deploy modal_app.py
Now you can make requests! Remember to fill in the missing info:
$ curl -X POST -F "audio=@<file>" https://<org_name>--transcribe-x-entrypoint.modal.run
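If you'd rather call it from Python, here is a minimal client sketch using the requests library; the URL placeholder and sample.mp3 filename are stand-ins you would replace with your own:

import requests

url = "https://<org_name>--transcribe-x-entrypoint.modal.run"  # your deployment URL
with open("sample.mp3", "rb") as f:  # any audio file ffmpeg can decode
    resp = requests.post(url, files={"audio": f})
resp.raise_for_status()

output, elapsed = resp.json()  # the endpoint returns (pipeline output, seconds elapsed)
print(f"{elapsed:.1f}s:", output["text"])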