diff --git a/.gitignore b/.gitignore
index 801935a6..506d5d35 100644
--- a/.gitignore
+++ b/.gitignore
@@ -157,7 +157,7 @@ cython_debug/
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
+.idea/
examples/results/*
gfpgan/*
@@ -166,4 +166,7 @@ assets/*
results/*
Dockerfile
start_docker.sh
-start.sh
\ No newline at end of file
+start.sh
+
+# Mac
+.DS_Store
diff --git a/README.md b/README.md
index af83e996..7724c7cf 100644
--- a/README.md
+++ b/README.md
@@ -5,8 +5,7 @@
- [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Winfredy/SadTalker/blob/main/quick_demo.ipynb) [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/vinthony/SadTalker) [![sd webui-colab](https://img.shields.io/badge/Automatic1111-Colab-green)](https://colab.research.google.com/github/camenduru/stable-diffusion-webui-colab/blob/main/video/stable/stable_diffusion_1_5_video_webui_colab.ipynb)
-
+ [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Winfredy/SadTalker/blob/main/quick_demo.ipynb) [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/vinthony/SadTalker) [![sd webui-colab](https://img.shields.io/badge/Automatic1111-Colab-green)](https://colab.research.google.com/github/camenduru/stable-diffusion-webui-colab/blob/main/video/stable/stable_diffusion_1_5_video_webui_colab.ipynb) [![Replicate](https://replicate.com/cjwbw/sadtalker/badge)](https://replicate.com/cjwbw/sadtalker)
Wenxuan Zhang *,1,2
@@ -121,9 +120,10 @@ Tutorials from communities: [中文windows教程](https://www.bilibili.com/video
### Windows ([中文windows教程](https://www.bilibili.com/video/BV1Dc411W7V6/)):
1. Install [Python 3.10.6](https://www.python.org/downloads/windows/), checking "Add Python to PATH".
-2. Install [git](https://git-scm.com/download/win).
-3. Install `ffmpeg`, following [this instruction](https://www.wikihow.com/Install-FFmpeg-on-Windows).
+2. Install [git](https://git-scm.com/download/win) manually, or run `scoop install git` via [scoop](https://scoop.sh/).
+3. Install `ffmpeg`, following [these instructions](https://www.wikihow.com/Install-FFmpeg-on-Windows), or run `scoop install ffmpeg` via [scoop](https://scoop.sh/).
4. Download our SadTalker repository, for example by running `git clone https://github.com/Winfredy/SadTalker.git`.
+5. Download the `checkpoint` and `gfpgan` models as described [below↓](https://github.com/Winfredy/SadTalker#-2-download-trained-models).
-5. Run `start.bat` from Windows Explorer as normal, non-administrator, user, a gradio WebUI demo will be started.
+6. Run `start.bat` from Windows Explorer as a normal, non-administrator user; a Gradio WebUI demo will start.
### Macbook:
diff --git a/cog.yaml b/cog.yaml
new file mode 100644
index 00000000..05bcbd58
--- /dev/null
+++ b/cog.yaml
@@ -0,0 +1,35 @@
+build:
+ gpu: true
+ cuda: "11.3"
+ python_version: "3.8"
+ system_packages:
+ - "ffmpeg"
+ - "libgl1-mesa-glx"
+ - "libglib2.0-0"
+ python_packages:
+ - "torch==1.12.1"
+ - "torchvision==0.13.1"
+ - "torchaudio==0.12.1"
+ - "joblib==1.1.0"
+ - "scikit-image==0.19.3"
+ - "basicsr==1.4.2"
+ - "facexlib==0.3.0"
+ - "resampy==0.3.1"
+ - "pydub==0.25.1"
+ - "scipy==1.10.1"
+ - "kornia==0.6.8"
+ - "face_alignment==1.3.5"
+ - "imageio==2.19.3"
+ - "imageio-ffmpeg==0.4.7"
+ - "librosa==0.9.2" #
+ - "tqdm==4.65.0"
+ - "yacs==0.1.8"
+ - "gfpgan==1.3.8"
+ - "dlib-bin==19.24.1"
+ - "av==10.0.0"
+ - "trimesh==3.9.20"
+ run:
+ - mkdir -p /root/.cache/torch/hub/checkpoints/ && wget --output-document "/root/.cache/torch/hub/checkpoints/s3fd-619a316812.pth" "https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth"
+ - mkdir -p /root/.cache/torch/hub/checkpoints/ && wget --output-document "/root/.cache/torch/hub/checkpoints/2DFAN4-cd938726ad.zip" "https://www.adrianbulat.com/downloads/python-fan/2DFAN4-cd938726ad.zip"
+
+predict: "predict.py:Predictor"
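+
+# With this config, a prediction can be run locally via the Cog CLI, for example
+# (illustrative input paths, assuming files from the repo's examples folder):
+#   cog predict -i source_image=@examples/source_image/art_0.png -i driven_audio=@examples/driven_audio/bus_chinese.wav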
diff --git a/docs/FAQ.md b/docs/FAQ.md
index fe758809..6451a226 100644
--- a/docs/FAQ.md
+++ b/docs/FAQ.md
@@ -26,3 +26,21 @@ Make sure you have downloaded the checkpoints and gfpgan as [here](https://githu
**Q: RuntimeError: unexpected EOF, expected 237192 more bytes. The file might be corrupted.**
The files are not automatically downloaded. Please update the code and download the gfpgan folders as [here](https://github.com/Winfredy/SadTalker#-2-download-trained-models).
+
+**Q: CUDA out of memory error**
+
+Please refer to https://stackoverflow.com/questions/73747731/runtimeerror-cuda-out-of-memory-how-setting-max-split-size-mb and set `PYTORCH_CUDA_ALLOC_CONF` before running inference:
+
+```
+# windows
+set PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128
+python inference.py ...
+
+# linux
+export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128
+python inference.py ...
+```
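+
+If the error persists, lowering the face renderer batch size (for example `python inference.py --batch_size 1 ...`) may also reduce peak GPU memory usage.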
+
+**Q: Error while decoding stream #0:0: Invalid data found when processing input [mp3float @ 0000015037628c00] Header missing**
+
+Our method only supports `.wav` and `.mp3` audio files as input; please make sure the provided audio is in one of these formats.
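+
+If your audio is in another format, you can convert it first, for example with `ffmpeg`:
+
+```
+ffmpeg -i input.m4a output.wav
+```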
diff --git a/predict.py b/predict.py
new file mode 100644
index 00000000..1bfcd28e
--- /dev/null
+++ b/predict.py
@@ -0,0 +1,192 @@
+"""run bash scripts/download_models.sh first to prepare the weights file"""
+import os
+import shutil
+from argparse import Namespace
+from src.utils.preprocess import CropAndExtract
+from src.test_audio2coeff import Audio2Coeff
+from src.facerender.animate import AnimateFromCoeff
+from src.generate_batch import get_data
+from src.generate_facerender_batch import get_facerender_data
+from src.utils.init_path import init_path
+from cog import BasePredictor, Input, Path
+
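+# weights directory prepared by scripts/download_models.sh (see the docstring above)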
+checkpoints = "checkpoints"
+
+
+class Predictor(BasePredictor):
+ def setup(self):
+ """Load the model into memory to make running multiple predictions efficient"""
+        device = "cuda"
+
+        sadtalker_paths = init_path(checkpoints, os.path.join("src", "config"))
+
+        # init models
+        self.preprocess_model = CropAndExtract(sadtalker_paths, device)
+
+ self.audio_to_coeff = Audio2Coeff(
+ sadtalker_paths,
+ device,
+ )
+
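+        # one face renderer per preprocess mode: "full" keeps the whole frame,
+        # while "others" serves the cropped/resized modes (selected in predict())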
+ self.animate_from_coeff = {
+ "full": AnimateFromCoeff(
+ sadtalker_paths,
+ device,
+ ),
+ "others": AnimateFromCoeff(
+ sadtalker_paths,
+ device,
+ ),
+ }
+
+ def predict(
+ self,
+ source_image: Path = Input(
+ description="Upload the source image, it can be video.mp4 or picture.png",
+ ),
+ driven_audio: Path = Input(
+ description="Upload the driven audio, accepts .wav and .mp4 file",
+ ),
+ enhancer: str = Input(
+ description="Choose a face enhancer",
+ choices=["gfpgan", "RestoreFormer"],
+ default="gfpgan",
+ ),
+ preprocess: str = Input(
+ description="how to preprocess the images",
+ choices=["crop", "resize", "full"],
+ default="full",
+ ),
+ ref_eyeblink: Path = Input(
+ description="path to reference video providing eye blinking",
+ default=None,
+ ),
+ ref_pose: Path = Input(
+ description="path to reference video providing pose",
+ default=None,
+ ),
+ still: bool = Input(
+ description="can crop back to the original videos for the full body aniamtion when preprocess is full",
+ default=True,
+ ),
+ ) -> Path:
+ """Run a single prediction on the model"""
+
+ animate_from_coeff = (
+ self.animate_from_coeff["full"]
+ if preprocess == "full"
+ else self.animate_from_coeff["others"]
+ )
+
+ args = load_default()
+ args.pic_path = str(source_image)
+ args.audio_path = str(driven_audio)
+ device = "cuda"
+ args.still = still
+ args.ref_eyeblink = None if ref_eyeblink is None else str(ref_eyeblink)
+ args.ref_pose = None if ref_pose is None else str(ref_pose)
+
+        # crop the source image and extract its 3DMM coefficients, starting from a clean results directory
+ results_dir = "results"
+ if os.path.exists(results_dir):
+ shutil.rmtree(results_dir)
+ os.makedirs(results_dir)
+ first_frame_dir = os.path.join(results_dir, "first_frame_dir")
+ os.makedirs(first_frame_dir)
+
+ print("3DMM Extraction for source image")
+ first_coeff_path, crop_pic_path, crop_info = self.preprocess_model.generate(
+ args.pic_path, first_frame_dir, preprocess, source_image_flag=True
+ )
+        if first_coeff_path is None:
+            raise ValueError("Can't get the coeffs of the input")
+
+ if ref_eyeblink is not None:
+ ref_eyeblink_videoname = os.path.splitext(os.path.split(ref_eyeblink)[-1])[
+ 0
+ ]
+ ref_eyeblink_frame_dir = os.path.join(results_dir, ref_eyeblink_videoname)
+ os.makedirs(ref_eyeblink_frame_dir, exist_ok=True)
+ print("3DMM Extraction for the reference video providing eye blinking")
+ ref_eyeblink_coeff_path, _, _ = self.preprocess_model.generate(
+ ref_eyeblink, ref_eyeblink_frame_dir
+ )
+ else:
+ ref_eyeblink_coeff_path = None
+
+ if ref_pose is not None:
+ if ref_pose == ref_eyeblink:
+ ref_pose_coeff_path = ref_eyeblink_coeff_path
+ else:
+ ref_pose_videoname = os.path.splitext(os.path.split(ref_pose)[-1])[0]
+ ref_pose_frame_dir = os.path.join(results_dir, ref_pose_videoname)
+ os.makedirs(ref_pose_frame_dir, exist_ok=True)
+ print("3DMM Extraction for the reference video providing pose")
+ ref_pose_coeff_path, _, _ = self.preprocess_model.generate(
+ ref_pose, ref_pose_frame_dir
+ )
+ else:
+ ref_pose_coeff_path = None
+
+        # audio to coefficients
+ batch = get_data(
+ first_coeff_path,
+ args.audio_path,
+ device,
+ ref_eyeblink_coeff_path,
+ still=still,
+ )
+ coeff_path = self.audio_to_coeff.generate(
+ batch, results_dir, args.pose_style, ref_pose_coeff_path
+ )
+ # coeff2video
+ print("coeff2video")
+ data = get_facerender_data(
+ coeff_path,
+ crop_pic_path,
+ first_coeff_path,
+ args.audio_path,
+ args.batch_size,
+ args.input_yaw,
+ args.input_pitch,
+ args.input_roll,
+ expression_scale=args.expression_scale,
+ still_mode=still,
+ preprocess=preprocess,
+ )
+ animate_from_coeff.generate(
+ data, results_dir, args.pic_path, crop_info,
+ enhancer=enhancer, background_enhancer=args.background_enhancer,
+ preprocess=preprocess)
+
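+        # generate() writes the enhanced video into results_dir; copy it to a stable output path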
+ output = "/tmp/out.mp4"
+ mp4_path = os.path.join(results_dir, [f for f in os.listdir(results_dir) if "enhanced.mp4" in f][0])
+ shutil.copy(mp4_path, output)
+
+ return Path(output)
+
+
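+# defaults mirroring the CLI arguments of SadTalker's inference.py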
+def load_default():
+ return Namespace(
+ pose_style=0,
+ batch_size=2,
+ expression_scale=1.0,
+ input_yaw=None,
+ input_pitch=None,
+ input_roll=None,
+ background_enhancer=None,
+ face3dvis=False,
+ net_recon="resnet50",
+ init_path=None,
+ use_last_fc=False,
+ bfm_folder="./src/config/",
+ bfm_model="BFM_model_front.mat",
+ focal=1015.0,
+ center=112.0,
+ camera_d=10.0,
+ z_near=5.0,
+ z_far=15.0,
+ )
diff --git a/src/facerender/animate.py b/src/facerender/animate.py
index 563d87fe..85583157 100644
--- a/src/facerender/animate.py
+++ b/src/facerender/animate.py
@@ -206,7 +206,8 @@ def generate(self, x, video_save_dir, pic_path, crop_info, enhancer=None, backgr
audio_name = os.path.splitext(os.path.split(audio_path)[-1])[0]
new_audio_path = os.path.join(video_save_dir, audio_name+'.wav')
start_time = 0
- sound = AudioSegment.from_mp3(audio_path)
+        # cog does not preserve the .mp3 filename extension, so let pydub infer the format from the file contents
+ sound = AudioSegment.from_file(audio_path)
frames = frame_num
end_time = start_time + frames*1/25*1000
word1=sound.set_frame_rate(16000)