1 year ago · 479173ae2c
--- a/.dockerignore
+++ b/.dockerignore
@@ -0,0 +1 @@
 
				+# Keep the runtime data directory (models, ingest, output, temp, cache) out of the build context; docker-compose bind-mounts it at /data instead.
				+/data/
			
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1 @@
 
				+# Runtime data directory (models, ingest, output, temp, cache) is local-only and must not be committed.
				+/data/
			
--- a/.gitmodules
+++ b/.gitmodules
@@ -0,0 +1,3 @@
 
				+# Upstream vocal-remover sources, checked out at ./vocal-remover; the Dockerfile copies this directory into the image.
				+[submodule "vocal-remover"]
			
 
				+	path = vocal-remover
			
 
				+	url = https://github.com/tsurumeso/vocal-remover.git
			
--- a/Dockerfile
+++ b/Dockerfile
@@ -0,0 +1,21 @@
 
				+# Image for the video dubbing pipeline: so-vits-svc-fork, vocal-remover and ffmpeg
				+# on top of the PyTorch CUDA runtime image.
				+FROM pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime
				+
				+# OS packages are installed in a single layer: `apt-get update` and the installs
				+# that depend on it must share a layer, otherwise a cached (stale) package index
				+# can be reused while a later install step is re-run. `apt-get` is used because
				+# the `apt` CLI is not intended for scripts. The package lists are removed in the
				+# same layer so they do not end up in the image.
				+RUN apt-get update \
				+    && DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get install -y --no-install-recommends \
				+        build-essential \
				+        ffmpeg \
				+    && rm -rf /var/lib/apt/lists/*
				+
				+# --no-cache-dir keeps pip's download cache out of the image layers.
				+RUN pip install --no-cache-dir -U pip setuptools wheel
				+RUN pip install --no-cache-dir -U so-vits-svc-fork click
				+
				+# Copy only the requirements file first so the dependency layer stays cached
				+# when other files of the submodule (e.g. the pretrained model) change.
				+COPY ./vocal-remover/requirements.txt /vocal-remover/requirements.txt
				+# NOTE(review): presumably needed because the unprivileged runtime user cannot
				+# write Numba's cache next to the installed packages - confirm.
				+ENV NUMBA_CACHE_DIR=/tmp
				+RUN pip install --no-cache-dir -r /vocal-remover/requirements.txt
				+COPY ./vocal-remover/ /vocal-remover/
				+
				+# Hugging Face and Matplotlib caches go to the mounted data directory.
				+ENV HF_HOME=/data/cache \
				+    MPLCONFIGDIR=/data/cache
				+
				+WORKDIR /
				+COPY ./app/ /app/
				+
				+# Run the pipeline as an unprivileged user.
				+USER 1000:1000
				+
				+ENTRYPOINT [ "/app/run.sh" ]
			
--- a/README.md
+++ b/README.md
@@ -0,0 +1,47 @@
 
				+# Video Dubbing with SoftVC VITS Singing Voice Conversion
			
 
				+
			
 
				+This is a deep-learning-based tool that re-dubs a video: it replaces the voice of the singer/narrator in a source video with a cloned voice.
			
 
				+
			
 
				+It uses [vocal-remover](https://github.com/tsurumeso/vocal-remover) to remove the voice from the source video, and then uses
			
 
				+[SoftVC VITS Singing Voice Conversion](https://github.com/voicepaw/so-vits-svc-fork) to convert the voice.
			
 
				+
			
 
				+## Installation
			
 
				+
			
 
				+### Requirements
			
 
				+
			
 
				+- Docker (https://docs.docker.com/get-docker/)
			
 
				+- A pretrained *so-vits-svc* model (https://huggingface.co/models?search=so-vits-svc)
			
 
				+- The pretrained *vocal-remover* model (https://github.com/tsurumeso/vocal-remover/)
			
 
				+
			
 
				+### Setup
			
 
				+
			
 
				+1. Clone this repository with submodules
			
 
				+   ```bash
			
 
				+   git clone --recursive https://gogs.justprojects.de/subDesTagesMitExtraKaese/video-dubbinig-svc.git
			
 
				+   cd video-dubbinig-svc
			
 
				+   ```
			
 
				+2. Create the `data` folder
			
 
				+   ```bash
			
 
				+   mkdir -p data/output data/ingest data/models
			
 
				+   ```
			
 
				+3. Download the pretrained *so-vits-svc* model and place it in the `data/models` folder
			
 
				+   - The path to the `G_*.pth` file should be given as the `MODEL_PATH` [environment variable](./docker-compose.yml)
			
 
				+   - The path to the `config.json` file should be given as the `MODEL_CONFIG_PATH` [environment variable](./docker-compose.yml)
			
 
				+4. Download the *vocal-remover* release and copy the pretrained model `models/baseline.pth` into the `vocal-remover/models` folder
			
 
				+   ```bash
			
 
				+   curl -L https://github.com/tsurumeso/vocal-remover/releases/download/v5.1.0/vocal-remover-v5.1.0.zip -o /tmp/vocal-remover.zip
			
 
				+   unzip /tmp/vocal-remover.zip -d /tmp/vocal-remover
			
 
				+   cp /tmp/vocal-remover/models/baseline.pth vocal-remover/models/
			
 
				+   rm -rf /tmp/vocal-remover /tmp/vocal-remover.zip
			
 
				+   ```
			
 
				+
			
 
				+5. Build the docker image
			
 
				+   ```bash
			
 
				+   docker compose build
			
 
				+   ```
			
 
				+6. Insert your source video into the `data/ingest` folder
			
 
				+7. Run the docker image
			
 
				+   ```bash
			
 
				+   docker compose up
			
 
				+   ```
			
 
				+8. The output video will be in the `data/output` folder
			
--- a/app/run.sh
+++ b/app/run.sh
@@ -0,0 +1,21 @@
 
				+#!/bin/bash
			
 
				+
			
 
				+mkdir -p /data/output /data/ingest /data/models /data/temp /data/cache
			
 
				+rm -rf /data/temp/*
			
 
				+
			
 
				+# For each video file in the ingest directory
			
 
				+for f in /data/ingest/*; do
			
 
				+    # Get file name without extension
			
 
				+    filename="$(basename "${f%.*}")"
			
 
				+    # Extract audio from video file
			
 
				+    ffmpeg -loglevel warning -y -i /data/ingest/"${f##*/}" -f wav /data/temp/"$filename".wav
			
 
				+    # Split voice and music from audio file
			
 
				+    python3 /vocal-remover/inference.py -i /data/temp/"$filename".wav -o /data/temp/ --tta --pretrained_model /vocal-remover/models/baseline.pth
			
 
				+    # Clone the voice
			
 
				+    svc infer -o /data/temp/"$filename"_"$SPEAKER".wav -m "$MODEL_PATH" -c "$MODEL_CONFIG_PATH" -s "$SPEAKER" /data/temp/"$filename"_Vocals.wav
			
 
				+    # Combine voice and music into one mp3 file
			
 
				+    ffmpeg -loglevel warning -y -i /data/temp/"$filename"_"$SPEAKER".wav -i /data/temp/"$filename"_Instruments.wav \
			
 
				+        -filter_complex "[0:a]volume=$VOCALS_VOLUME[a0];[a0][1:a]amix=inputs=2:duration=longest" /data/temp/"$filename"_"$SPEAKER"_combined.wav
			
 
				+    # Combine audio and video into one file
			
 
				+    ffmpeg -loglevel warning -y -i /data/ingest/"${f##*/}" -i /data/temp/"$filename"_"$SPEAKER"_combined.wav -c:v copy -map 0:v:0 -map 1:a:0 /data/output/"$filename".mp4
			
 
				+done
			
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -0,0 +1,14 @@
 
				+version: "3.7"
				+
				+services:
				+  # One-shot job: processes every video in ./data/ingest and writes to ./data/output.
				+  video-dubbing-svc:
				+    container_name: video-dubbing-svc
				+    build:
				+      context: .
				+
				+    # Models, input, output, temp files and caches all live under ./data on the host.
				+    volumes:
				+      - ./data:/data
				+
				+    # Settings read by /app/run.sh.
				+    environment:
				+      MODEL_PATH: "/data/models/hanzo_ana/G_460.pth"
				+      MODEL_CONFIG_PATH: "/data/models/hanzo_ana/config.json"
				+      SPEAKER: "hanzo"
				+      VOCALS_VOLUME: "1.5"
			
--- a/vocal-remover
+++ b/vocal-remover
@@ -0,0 +1 @@
 
				+Subproject commit 8a02fc5fdc88419524bb7f1e7f50e99eb300615f
		`@@ -0,0 +1 @@`
		`+Subproject commit 8a02fc5fdc88419524bb7f1e7f50e99eb300615f`