subDesTagesMitExtraKaese 1 year ago
commit
479173ae2c
8 changed files with 109 additions and 0 deletions
  1. 1 0
      .dockerignore
  2. 1 0
      .gitignore
  3. 3 0
      .gitmodules
  4. 21 0
      Dockerfile
  5. 47 0
      README.md
  6. 21 0
      app/run.sh
  7. 14 0
      docker-compose.yml
  8. 1 0
      vocal-remover

+ 1 - 0
.dockerignore

@@ -0,0 +1 @@
+/data/

+ 1 - 0
.gitignore

@@ -0,0 +1 @@
+/data/

+ 3 - 0
.gitmodules

@@ -0,0 +1,3 @@
+[submodule "vocal-remover"]
+	path = vocal-remover
+	url = https://github.com/tsurumeso/vocal-remover.git

+ 21 - 0
Dockerfile

@@ -0,0 +1,21 @@
+FROM pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime
+
+# OS packages. `apt-get update` and `apt-get install` share one layer so a
+# cached, stale package index is never reused, and the index is removed in the
+# same layer so it does not end up in the image.
+# - build-essential: presumably needed to compile native extensions of the pip
+#   packages installed below (TODO confirm it is still required)
+# - ffmpeg: called by /app/run.sh
+RUN apt-get update \
+ && DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get install -y --no-install-recommends \
+      build-essential \
+      ffmpeg \
+ && rm -rf /var/lib/apt/lists/*
+
+# --no-cache-dir keeps pip's download cache out of the image layers
+RUN pip install --no-cache-dir -U pip setuptools wheel
+RUN pip install --no-cache-dir -U so-vits-svc-fork click
+
+ENV NUMBA_CACHE_DIR=/tmp
+
+# Install the vocal-remover requirements before copying the full checkout, so
+# changes to other files in ./vocal-remover (e.g. the model placed in
+# vocal-remover/models) do not invalidate the dependency layer.
+WORKDIR /vocal-remover
+COPY ./vocal-remover/requirements.txt ./
+RUN pip install --no-cache-dir -r requirements.txt
+COPY ./vocal-remover/ ./
+
+# Both caches point into /data/cache, which /app/run.sh creates at startup
+ENV HF_HOME=/data/cache
+ENV MPLCONFIGDIR=/data/cache
+
+WORKDIR /
+COPY ./app/ /app/
+
+# Drop root privileges for the runtime process
+USER 1000:1000
+
+ENTRYPOINT [ "/app/run.sh" ]

+ 47 - 0
README.md

@@ -0,0 +1,47 @@
+# Video Dubbing with SoftVC VITS Singing Voice Conversion
+
+This is a deep-learning-based tool to clone the voice of a singer/narrator from a source video.
+
+It uses [vocal-remover](https://github.com/tsurumeso/vocal-remover) to remove the voice from the source video, and then uses
+[SoftVC VITS Singing Voice Conversion](https://github.com/voicepaw/so-vits-svc-fork) to convert the voice.
+
+## Installation
+
+### Requirements
+
+- Docker (https://docs.docker.com/get-docker/)
+- A pretrained *so-vits-svc* model (https://huggingface.co/models?search=so-vits-svc)
+- The pretrained *vocal-remover* model (https://github.com/tsurumeso/vocal-remover/)
+
+### Setup
+
+1. Clone this repository with submodules
+   ```bash
+   git clone --recursive https://gogs.justprojects.de/subDesTagesMitExtraKaese/video-dubbinig-svc.git
+   cd video-dubbinig-svc
+   ```
+2. Create the `data` folder
+   ```bash
+   mkdir -p data/output data/ingest data/models
+   ```
+3. Download the pretrained *so-vits-svc* model and place it in the `data/models` folder
+   - The path to the `G_*.pth` file should be given as the `MODEL_PATH` [environment variable](./docker-compose.yml)
+   - The path to the `config.json` file should be given as the `MODEL_CONFIG_PATH` [environment variable](./docker-compose.yml)
+4. Download the *vocal-remover* release and copy the pretrained model `models/baseline.pth` into the `vocal-remover/models` folder
+   ```bash
+   curl -L https://github.com/tsurumeso/vocal-remover/releases/download/v5.1.0/vocal-remover-v5.1.0.zip -o /tmp/vocal-remover.zip
+   unzip /tmp/vocal-remover.zip -d /tmp/vocal-remover
+   cp /tmp/vocal-remover/models/baseline.pth vocal-remover/models/
+   rm -rf /tmp/vocal-remover /tmp/vocal-remover.zip
+   ```
+
+5. Build the docker image
+   ```bash
+   docker compose build
+   ```
+6. Insert your source video into the `data/ingest` folder
+7. Run the docker image
+   ```bash
+   docker compose up
+   ```
+8. The output video will be in the `data/output` folder

+ 21 - 0
app/run.sh

@@ -0,0 +1,21 @@
+#!/bin/bash
+
+mkdir -p /data/output /data/ingest /data/models /data/temp /data/cache
+rm -rf /data/temp/*
+
+# For each video file in the ingest directory
+for f in /data/ingest/*; do
+    # Get file name without extension
+    filename="$(basename "${f%.*}")"
+    # Extract audio from video file
+    ffmpeg -loglevel warning -y -i /data/ingest/"${f##*/}" -f wav /data/temp/"$filename".wav
+    # Split voice and music from audio file
+    python3 /vocal-remover/inference.py -i /data/temp/"$filename".wav -o /data/temp/ --tta --pretrained_model /vocal-remover/models/baseline.pth
+    # Clone the voice
+    svc infer -o /data/temp/"$filename"_"$SPEAKER".wav -m "$MODEL_PATH" -c "$MODEL_CONFIG_PATH" -s "$SPEAKER" /data/temp/"$filename"_Vocals.wav
+    # Combine voice and music into one mp3 file
+    ffmpeg -loglevel warning -y -i /data/temp/"$filename"_"$SPEAKER".wav -i /data/temp/"$filename"_Instruments.wav \
+        -filter_complex "[0:a]volume=$VOCALS_VOLUME[a0];[a0][1:a]amix=inputs=2:duration=longest" /data/temp/"$filename"_"$SPEAKER"_combined.wav
+    # Combine audio and video into one file
+    ffmpeg -loglevel warning -y -i /data/ingest/"${f##*/}" -i /data/temp/"$filename"_"$SPEAKER"_combined.wav -c:v copy -map 0:v:0 -map 1:a:0 /data/output/"$filename".mp4
+done

+ 14 - 0
docker-compose.yml

@@ -0,0 +1,14 @@
+# Builds the image from the Dockerfile in this directory and runs it once
+# against the local ./data folder.
+version: "3.7"
+services:
+  video-dubbing-svc:
+    container_name: video-dubbing-svc
+    build: .
+
+    # Host ./data is mounted at /data, where app/run.sh reads ingest/ and
+    # models/ and writes output/, temp/ and cache/
+    volumes:
+      - ./data:/data
+
+    # Settings read by app/run.sh
+    environment:
+      MODEL_PATH: "/data/models/hanzo_ana/G_460.pth"
+      MODEL_CONFIG_PATH: "/data/models/hanzo_ana/config.json"
+      SPEAKER: "hanzo"
+      VOCALS_VOLUME: "1.5"

+ 1 - 0
vocal-remover

@@ -0,0 +1 @@
+Subproject commit 8a02fc5fdc88419524bb7f1e7f50e99eb300615f