Upload 67 files

This commit is contained in:
liu 2023-01-12 16:57:54 +00:00 committed by huggingface-web
parent 26194236d6
commit 1c75048c96
67 changed files with 6141 additions and 0 deletions

38
config.py Normal file
View File

@ -0,0 +1,38 @@
############离线VC参数
inp_root=r"白鹭霜华长条"#对输入目录下所有音频进行转换,别放非音频文件
opt_root=r"opt"#输出目录
f0_up_key=0#升降调整数男转女12女转男-12
person=r"weights\洛天依v3.pt"#目前只有洛天依v3
############硬件参数
device = "cuda:0"#填写cuda:x或cpux指代第几张卡只支持N卡加速
is_half=True#9-10-20-30-40系显卡无脑True不影响质量>=20显卡开启有加速
n_cpu=0#默认0用上所有线程写数字限制CPU资源使用
############下头别动
import torch
if(torch.cuda.is_available()==False):
print("没有发现支持的N卡使用CPU进行推理")
device="cpu"
is_half=False
if(device!="cpu"):
gpu_name=torch.cuda.get_device_name(int(device.split(":")[-1]))
if("16"in gpu_name):
print("16系显卡强制单精度")
is_half=False
from multiprocessing import cpu_count
if(n_cpu==0):n_cpu=cpu_count()
if(is_half==True):
#6G显存配置
x_pad = 3
x_query = 10
x_center = 60
x_max = 65
else:
#5G显存配置
x_pad = 1
# x_query = 6
# x_center = 30
# x_max = 32
#6G显存配置
x_query = 6
x_center = 38
x_max = 41

1
go-web.bat Normal file
View File

@ -0,0 +1 @@
runtime\python.exe infer-web.py

1
go.bat Normal file
View File

@ -0,0 +1 @@
runtime\python.exe infer.py

3
hubert_base.pt Normal file
View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f54b40fd2802423a5643779c4861af1e9ee9c1564dc9d32f54f20b5ffba7db96
size 189507909

193
infer-web.py Normal file
View File

@ -0,0 +1,193 @@
import torch, pdb, os,traceback,sys,warnings,shutil
now_dir=os.getcwd()
sys.path.append(now_dir)
tmp=os.path.join(now_dir,"TEMP")
shutil.rmtree(tmp,ignore_errors=True)
os.makedirs(tmp,exist_ok=True)
os.environ["TEMP"]=tmp
warnings.filterwarnings("ignore")
torch.manual_seed(114514)
from infer_pack.models import SynthesizerTrnMs256NSF as SynthesizerTrn256
from scipy.io import wavfile
from fairseq import checkpoint_utils
import gradio as gr
import librosa
import logging
from vc_infer_pipeline import VC
import soundfile as sf
from config import is_half,device,is_half
from infer_uvr5 import _audio_pre_
logging.getLogger('numba').setLevel(logging.WARNING)
models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(["hubert_base.pt"],suffix="",)
hubert_model = models[0]
hubert_model = hubert_model.to(device)
if(is_half):hubert_model = hubert_model.half()
else:hubert_model = hubert_model.float()
hubert_model.eval()
weight_root="weights"
weight_uvr5_root="uvr5_weights"
names=[]
for name in os.listdir(weight_root):names.append(name.replace(".pt",""))
uvr5_names=[]
for name in os.listdir(weight_uvr5_root):uvr5_names.append(name.replace(".pth",""))
def get_vc(sid):
person = "%s/%s.pt" % (weight_root, sid)
cpt = torch.load(person, map_location="cpu")
dv = cpt["dv"]
tgt_sr = cpt["config"][-1]
net_g = SynthesizerTrn256(*cpt["config"], is_half=is_half)
net_g.load_state_dict(cpt["weight"], strict=True)
net_g.eval().to(device)
if (is_half):net_g = net_g.half()
else:net_g = net_g.float()
vc = VC(tgt_sr, device, is_half)
return dv,tgt_sr,net_g,vc
def vc_single(sid,input_audio,f0_up_key,f0_file):
if input_audio is None:return "You need to upload an audio", None
f0_up_key = int(f0_up_key)
try:
if(type(input_audio)==str):
print("processing %s" % input_audio)
audio, sampling_rate = sf.read(input_audio)
else:
sampling_rate, audio = input_audio
audio = audio.astype("float32") / 32768
if(type(sid)==str):dv, tgt_sr, net_g, vc=get_vc(sid)
else:dv,tgt_sr,net_g,vc=sid
if len(audio.shape) > 1:
audio = librosa.to_mono(audio.transpose(1, 0))
if sampling_rate != 16000:
audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
times = [0, 0, 0]
audio_opt=vc.pipeline(hubert_model,net_g,dv,audio,times,f0_up_key,f0_file=f0_file)
print(times)
return "Success", (tgt_sr, audio_opt)
except:
info=traceback.format_exc()
print(info)
return info,(None,None)
finally:
print("clean_empty_cache")
del net_g,dv,vc
torch.cuda.empty_cache()
def vc_multi(sid,dir_path,opt_root,paths,f0_up_key):
try:
dir_path=dir_path.strip(" ")#防止小白拷路径头尾带了空格
opt_root=opt_root.strip(" ")
os.makedirs(opt_root, exist_ok=True)
dv, tgt_sr, net_g, vc = get_vc(sid)
try:
if(dir_path!=""):paths=[os.path.join(dir_path,name)for name in os.listdir(dir_path)]
else:paths=[path.name for path in paths]
except:
traceback.print_exc()
paths = [path.name for path in paths]
infos=[]
for path in paths:
info,opt=vc_single([dv,tgt_sr,net_g,vc],path,f0_up_key,f0_file=None)
if(info=="Success"):
try:
tgt_sr,audio_opt=opt
wavfile.write("%s/%s" % (opt_root, os.path.basename(path)), tgt_sr, audio_opt)
except:
info=traceback.format_exc()
infos.append("%s->%s"%(os.path.basename(path),info))
return "\n".join(infos)
except:
return traceback.format_exc()
finally:
print("clean_empty_cache")
del net_g,dv,vc
torch.cuda.empty_cache()
def uvr(model_name,inp_root,save_root_vocal,save_root_ins):
infos = []
try:
inp_root = inp_root.strip(" ")# 防止小白拷路径头尾带了空格
save_root_vocal = save_root_vocal.strip(" ")
save_root_ins = save_root_ins.strip(" ")
pre_fun = _audio_pre_(model_path=os.path.join(weight_uvr5_root,model_name+".pth"), device=device, is_half=is_half)
for name in os.listdir(inp_root):
inp_path=os.path.join(inp_root,name)
try:
pre_fun._path_audio_(inp_path , save_root_ins,save_root_vocal)
infos.append("%s->Success"%(os.path.basename(inp_path)))
except:
infos.append("%s->%s" % (os.path.basename(inp_path),traceback.format_exc()))
except:
infos.append(traceback.format_exc())
finally:
try:
del pre_fun.model
del pre_fun
except:
traceback.print_exc()
print("clean_empty_cache")
torch.cuda.empty_cache()
return "\n".join(infos)
with gr.Blocks() as app:
with gr.Tabs():
with gr.TabItem("推理"):
with gr.Group():
gr.Markdown(value="""
使用软件者传播软件导出的声音者自负全责如不认可该条款则不能使用/引用软件包内所有代码和文件<br>
目前仅开放白菜音色后续将扩展为本地训练推理工具用户可训练自己的音色进行社区共享<br>
男转女推荐+12key女转男推荐-12key如果音域爆炸导致音色失真也可以自己调整到合适音域
""")
with gr.Row():
with gr.Column():
sid0 = gr.Dropdown(label="音色", choices=names)
vc_transform0 = gr.Number(label="变调整数半音数量升八度12降八度-12", value=12)
f0_file = gr.File(label="F0曲线文件可选一行一个音高代替默认F0及升降调")
input_audio0 = gr.Audio(label="上传音频")
but0=gr.Button("转换", variant="primary")
with gr.Column():
vc_output1 = gr.Textbox(label="输出信息")
vc_output2 = gr.Audio(label="输出音频")
but0.click(vc_single, [sid0, input_audio0, vc_transform0,f0_file], [vc_output1, vc_output2])
with gr.Group():
gr.Markdown(value="""
批量转换上传多个音频文件在指定文件夹默认opt下输出转换的音频<br>
合格的文件夹路径格式举例E:\codes\py39\\vits_vc_gpu\白鹭霜华测试样例去文件管理器地址栏拷就行了
""")
with gr.Row():
with gr.Column():
sid1 = gr.Dropdown(label="音色", choices=names)
vc_transform1 = gr.Number(label="变调整数半音数量升八度12降八度-12", value=12)
opt_input = gr.Textbox(label="指定输出文件夹",value="opt")
with gr.Column():
dir_input = gr.Textbox(label="输入待处理音频文件夹路径")
inputs = gr.File(file_count="multiple", label="也可批量输入音频文件,二选一,优先读文件夹")
but1=gr.Button("转换", variant="primary")
vc_output3 = gr.Textbox(label="输出信息")
but1.click(vc_multi, [sid1, dir_input,opt_input,inputs, vc_transform1], [vc_output3])
with gr.TabItem("数据处理"):
with gr.Group():
gr.Markdown(value="""
人声伴奏分离批量处理使用UVR5模型<br>
不带和声用HP2带和声且提取的人声不需要和声用HP5<br>
合格的文件夹路径格式举例E:\codes\py39\\vits_vc_gpu\白鹭霜华测试样例去文件管理器地址栏拷就行了
""")
with gr.Row():
with gr.Column():
dir_wav_input = gr.Textbox(label="输入待处理音频文件夹路径")
wav_inputs = gr.File(file_count="multiple", label="也可批量输入音频文件,二选一,优先读文件夹")
with gr.Column():
model_choose = gr.Dropdown(label="模型", choices=uvr5_names)
opt_vocal_root = gr.Textbox(label="指定输出人声文件夹",value="opt")
opt_ins_root = gr.Textbox(label="指定输出乐器文件夹",value="opt")
but2=gr.Button("转换", variant="primary")
vc_output4 = gr.Textbox(label="输出信息")
but2.click(uvr, [model_choose, dir_wav_input,opt_vocal_root,opt_ins_root], [vc_output4])
with gr.TabItem("训练-待开放"):pass
# app.launch(server_name="0.0.0.0",server_port=7860)
app.launch(server_name="127.0.0.1",server_port=7860)

48
infer.py Normal file
View File

@ -0,0 +1,48 @@
import torch, pdb, os,sys,librosa,warnings,traceback
warnings.filterwarnings("ignore")
torch.manual_seed(114514)
sys.path.append(os.getcwd())
from config import inp_root,opt_root,f0_up_key,person,is_half,device
os.makedirs(opt_root,exist_ok=True)
import soundfile as sf
from infer_pack.models import SynthesizerTrnMs256NSF as SynthesizerTrn256
from scipy.io import wavfile
from fairseq import checkpoint_utils
import scipy.signal as signal
from vc_infer_pipeline import VC
models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(["hubert_base.pt"],suffix="",)
model = models[0]
model = model.to(device)
if(is_half):model = model.half()
else:model = model.float()
model.eval()
cpt=torch.load(person,map_location="cpu")
dv=cpt["dv"]
tgt_sr=cpt["config"][-1]
net_g = SynthesizerTrn256(*cpt["config"],is_half=is_half)
net_g.load_state_dict(cpt["weight"],strict=True)
net_g.eval().to(device)
if(is_half):net_g = net_g.half()
else:net_g = net_g.float()
vc=VC(tgt_sr,device,is_half)
for name in os.listdir(inp_root):
try:
wav_path="%s\%s"%(inp_root,name)
print("processing %s"%wav_path)
audio, sampling_rate = sf.read(wav_path)
if len(audio.shape) > 1:
audio = librosa.to_mono(audio.transpose(1, 0))
if sampling_rate != vc.sr:
audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=vc.sr)
times = [0, 0, 0]
audio_opt=vc.pipeline(model,net_g,dv,audio,times,f0_up_key,f0_file=None)
wavfile.write("%s/%s"%(opt_root,name), tgt_sr, audio_opt)
except:
traceback.print_exc()
print(times)

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

417
infer_pack/attentions.py Normal file
View File

@ -0,0 +1,417 @@
import copy
import math
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F
from infer_pack import commons
from infer_pack import modules
from infer_pack.modules import LayerNorm
class Encoder(nn.Module):
def __init__(
self,
hidden_channels,
filter_channels,
n_heads,
n_layers,
kernel_size=1,
p_dropout=0.0,
window_size=10,
**kwargs
):
super().__init__()
self.hidden_channels = hidden_channels
self.filter_channels = filter_channels
self.n_heads = n_heads
self.n_layers = n_layers
self.kernel_size = kernel_size
self.p_dropout = p_dropout
self.window_size = window_size
self.drop = nn.Dropout(p_dropout)
self.attn_layers = nn.ModuleList()
self.norm_layers_1 = nn.ModuleList()
self.ffn_layers = nn.ModuleList()
self.norm_layers_2 = nn.ModuleList()
for i in range(self.n_layers):
self.attn_layers.append(
MultiHeadAttention(
hidden_channels,
hidden_channels,
n_heads,
p_dropout=p_dropout,
window_size=window_size,
)
)
self.norm_layers_1.append(LayerNorm(hidden_channels))
self.ffn_layers.append(
FFN(
hidden_channels,
hidden_channels,
filter_channels,
kernel_size,
p_dropout=p_dropout,
)
)
self.norm_layers_2.append(LayerNorm(hidden_channels))
def forward(self, x, x_mask):
attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
x = x * x_mask
for i in range(self.n_layers):
y = self.attn_layers[i](x, x, attn_mask)
y = self.drop(y)
x = self.norm_layers_1[i](x + y)
y = self.ffn_layers[i](x, x_mask)
y = self.drop(y)
x = self.norm_layers_2[i](x + y)
x = x * x_mask
return x
class Decoder(nn.Module):
def __init__(
self,
hidden_channels,
filter_channels,
n_heads,
n_layers,
kernel_size=1,
p_dropout=0.0,
proximal_bias=False,
proximal_init=True,
**kwargs
):
super().__init__()
self.hidden_channels = hidden_channels
self.filter_channels = filter_channels
self.n_heads = n_heads
self.n_layers = n_layers
self.kernel_size = kernel_size
self.p_dropout = p_dropout
self.proximal_bias = proximal_bias
self.proximal_init = proximal_init
self.drop = nn.Dropout(p_dropout)
self.self_attn_layers = nn.ModuleList()
self.norm_layers_0 = nn.ModuleList()
self.encdec_attn_layers = nn.ModuleList()
self.norm_layers_1 = nn.ModuleList()
self.ffn_layers = nn.ModuleList()
self.norm_layers_2 = nn.ModuleList()
for i in range(self.n_layers):
self.self_attn_layers.append(
MultiHeadAttention(
hidden_channels,
hidden_channels,
n_heads,
p_dropout=p_dropout,
proximal_bias=proximal_bias,
proximal_init=proximal_init,
)
)
self.norm_layers_0.append(LayerNorm(hidden_channels))
self.encdec_attn_layers.append(
MultiHeadAttention(
hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout
)
)
self.norm_layers_1.append(LayerNorm(hidden_channels))
self.ffn_layers.append(
FFN(
hidden_channels,
hidden_channels,
filter_channels,
kernel_size,
p_dropout=p_dropout,
causal=True,
)
)
self.norm_layers_2.append(LayerNorm(hidden_channels))
def forward(self, x, x_mask, h, h_mask):
"""
x: decoder input
h: encoder output
"""
self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(
device=x.device, dtype=x.dtype
)
encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
x = x * x_mask
for i in range(self.n_layers):
y = self.self_attn_layers[i](x, x, self_attn_mask)
y = self.drop(y)
x = self.norm_layers_0[i](x + y)
y = self.encdec_attn_layers[i](x, h, encdec_attn_mask)
y = self.drop(y)
x = self.norm_layers_1[i](x + y)
y = self.ffn_layers[i](x, x_mask)
y = self.drop(y)
x = self.norm_layers_2[i](x + y)
x = x * x_mask
return x
class MultiHeadAttention(nn.Module):
def __init__(
self,
channels,
out_channels,
n_heads,
p_dropout=0.0,
window_size=None,
heads_share=True,
block_length=None,
proximal_bias=False,
proximal_init=False,
):
super().__init__()
assert channels % n_heads == 0
self.channels = channels
self.out_channels = out_channels
self.n_heads = n_heads
self.p_dropout = p_dropout
self.window_size = window_size
self.heads_share = heads_share
self.block_length = block_length
self.proximal_bias = proximal_bias
self.proximal_init = proximal_init
self.attn = None
self.k_channels = channels // n_heads
self.conv_q = nn.Conv1d(channels, channels, 1)
self.conv_k = nn.Conv1d(channels, channels, 1)
self.conv_v = nn.Conv1d(channels, channels, 1)
self.conv_o = nn.Conv1d(channels, out_channels, 1)
self.drop = nn.Dropout(p_dropout)
if window_size is not None:
n_heads_rel = 1 if heads_share else n_heads
rel_stddev = self.k_channels**-0.5
self.emb_rel_k = nn.Parameter(
torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
* rel_stddev
)
self.emb_rel_v = nn.Parameter(
torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
* rel_stddev
)
nn.init.xavier_uniform_(self.conv_q.weight)
nn.init.xavier_uniform_(self.conv_k.weight)
nn.init.xavier_uniform_(self.conv_v.weight)
if proximal_init:
with torch.no_grad():
self.conv_k.weight.copy_(self.conv_q.weight)
self.conv_k.bias.copy_(self.conv_q.bias)
def forward(self, x, c, attn_mask=None):
q = self.conv_q(x)
k = self.conv_k(c)
v = self.conv_v(c)
x, self.attn = self.attention(q, k, v, mask=attn_mask)
x = self.conv_o(x)
return x
def attention(self, query, key, value, mask=None):
# reshape [b, d, t] -> [b, n_h, t, d_k]
b, d, t_s, t_t = (*key.size(), query.size(2))
query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
if self.window_size is not None:
assert (
t_s == t_t
), "Relative attention is only available for self-attention."
key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
rel_logits = self._matmul_with_relative_keys(
query / math.sqrt(self.k_channels), key_relative_embeddings
)
scores_local = self._relative_position_to_absolute_position(rel_logits)
scores = scores + scores_local
if self.proximal_bias:
assert t_s == t_t, "Proximal bias is only available for self-attention."
scores = scores + self._attention_bias_proximal(t_s).to(
device=scores.device, dtype=scores.dtype
)
if mask is not None:
scores = scores.masked_fill(mask == 0, -1e4)
if self.block_length is not None:
assert (
t_s == t_t
), "Local attention is only available for self-attention."
block_mask = (
torch.ones_like(scores)
.triu(-self.block_length)
.tril(self.block_length)
)
scores = scores.masked_fill(block_mask == 0, -1e4)
p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s]
p_attn = self.drop(p_attn)
output = torch.matmul(p_attn, value)
if self.window_size is not None:
relative_weights = self._absolute_position_to_relative_position(p_attn)
value_relative_embeddings = self._get_relative_embeddings(
self.emb_rel_v, t_s
)
output = output + self._matmul_with_relative_values(
relative_weights, value_relative_embeddings
)
output = (
output.transpose(2, 3).contiguous().view(b, d, t_t)
) # [b, n_h, t_t, d_k] -> [b, d, t_t]
return output, p_attn
def _matmul_with_relative_values(self, x, y):
"""
x: [b, h, l, m]
y: [h or 1, m, d]
ret: [b, h, l, d]
"""
ret = torch.matmul(x, y.unsqueeze(0))
return ret
def _matmul_with_relative_keys(self, x, y):
"""
x: [b, h, l, d]
y: [h or 1, m, d]
ret: [b, h, l, m]
"""
ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
return ret
def _get_relative_embeddings(self, relative_embeddings, length):
max_relative_position = 2 * self.window_size + 1
# Pad first before slice to avoid using cond ops.
pad_length = max(length - (self.window_size + 1), 0)
slice_start_position = max((self.window_size + 1) - length, 0)
slice_end_position = slice_start_position + 2 * length - 1
if pad_length > 0:
padded_relative_embeddings = F.pad(
relative_embeddings,
commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
)
else:
padded_relative_embeddings = relative_embeddings
used_relative_embeddings = padded_relative_embeddings[
:, slice_start_position:slice_end_position
]
return used_relative_embeddings
def _relative_position_to_absolute_position(self, x):
"""
x: [b, h, l, 2*l-1]
ret: [b, h, l, l]
"""
batch, heads, length, _ = x.size()
# Concat columns of pad to shift from relative to absolute indexing.
x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))
# Concat extra elements so to add up to shape (len+1, 2*len-1).
x_flat = x.view([batch, heads, length * 2 * length])
x_flat = F.pad(
x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])
)
# Reshape and slice out the padded elements.
x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[
:, :, :length, length - 1 :
]
return x_final
def _absolute_position_to_relative_position(self, x):
"""
x: [b, h, l, l]
ret: [b, h, l, 2*l-1]
"""
batch, heads, length, _ = x.size()
# padd along column
x = F.pad(
x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])
)
x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
# add 0's in the beginning that will skew the elements after reshape
x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
return x_final
def _attention_bias_proximal(self, length):
"""Bias for self-attention to encourage attention to close positions.
Args:
length: an integer scalar.
Returns:
a Tensor with shape [1, 1, length, length]
"""
r = torch.arange(length, dtype=torch.float32)
diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
class FFN(nn.Module):
def __init__(
self,
in_channels,
out_channels,
filter_channels,
kernel_size,
p_dropout=0.0,
activation=None,
causal=False,
):
super().__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.filter_channels = filter_channels
self.kernel_size = kernel_size
self.p_dropout = p_dropout
self.activation = activation
self.causal = causal
if causal:
self.padding = self._causal_padding
else:
self.padding = self._same_padding
self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
self.drop = nn.Dropout(p_dropout)
def forward(self, x, x_mask):
x = self.conv_1(self.padding(x * x_mask))
if self.activation == "gelu":
x = x * torch.sigmoid(1.702 * x)
else:
x = torch.relu(x)
x = self.drop(x)
x = self.conv_2(self.padding(x * x_mask))
return x * x_mask
def _causal_padding(self, x):
if self.kernel_size == 1:
return x
pad_l = self.kernel_size - 1
pad_r = 0
padding = [[0, 0], [0, 0], [pad_l, pad_r]]
x = F.pad(x, commons.convert_pad_shape(padding))
return x
def _same_padding(self, x):
if self.kernel_size == 1:
return x
pad_l = (self.kernel_size - 1) // 2
pad_r = self.kernel_size // 2
padding = [[0, 0], [0, 0], [pad_l, pad_r]]
x = F.pad(x, commons.convert_pad_shape(padding))
return x

164
infer_pack/commons.py Normal file
View File

@ -0,0 +1,164 @@
import math
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F
def init_weights(m, mean=0.0, std=0.01):
classname = m.__class__.__name__
if classname.find("Conv") != -1:
m.weight.data.normal_(mean, std)
def get_padding(kernel_size, dilation=1):
return int((kernel_size * dilation - dilation) / 2)
def convert_pad_shape(pad_shape):
l = pad_shape[::-1]
pad_shape = [item for sublist in l for item in sublist]
return pad_shape
def kl_divergence(m_p, logs_p, m_q, logs_q):
"""KL(P||Q)"""
kl = (logs_q - logs_p) - 0.5
kl += (
0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q)
)
return kl
def rand_gumbel(shape):
"""Sample from the Gumbel distribution, protect from overflows."""
uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
return -torch.log(-torch.log(uniform_samples))
def rand_gumbel_like(x):
g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
return g
def slice_segments(x, ids_str, segment_size=4):
ret = torch.zeros_like(x[:, :, :segment_size])
for i in range(x.size(0)):
idx_str = ids_str[i]
idx_end = idx_str + segment_size
ret[i] = x[i, :, idx_str:idx_end]
return ret
def slice_segments2(x, ids_str, segment_size=4):
ret = torch.zeros_like(x[:, :segment_size])
for i in range(x.size(0)):
idx_str = ids_str[i]
idx_end = idx_str + segment_size
ret[i] = x[i, idx_str:idx_end]
return ret
def rand_slice_segments(x, x_lengths=None, segment_size=4):
b, d, t = x.size()
if x_lengths is None:
x_lengths = t
ids_str_max = x_lengths - segment_size + 1
ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
ret = slice_segments(x, ids_str, segment_size)
return ret, ids_str
def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
position = torch.arange(length, dtype=torch.float)
num_timescales = channels // 2
log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / (
num_timescales - 1
)
inv_timescales = min_timescale * torch.exp(
torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment
)
scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
signal = F.pad(signal, [0, 0, 0, channels % 2])
signal = signal.view(1, channels, length)
return signal
def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
b, channels, length = x.size()
signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
return x + signal.to(dtype=x.dtype, device=x.device)
def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
b, channels, length = x.size()
signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
def subsequent_mask(length):
mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
return mask
@torch.jit.script
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
n_channels_int = n_channels[0]
in_act = input_a + input_b
t_act = torch.tanh(in_act[:, :n_channels_int, :])
s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
acts = t_act * s_act
return acts
def convert_pad_shape(pad_shape):
l = pad_shape[::-1]
pad_shape = [item for sublist in l for item in sublist]
return pad_shape
def shift_1d(x):
x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
return x
def sequence_mask(length, max_length=None):
if max_length is None:
max_length = length.max()
x = torch.arange(max_length, dtype=length.dtype, device=length.device)
return x.unsqueeze(0) < length.unsqueeze(1)
def generate_path(duration, mask):
"""
duration: [b, 1, t_x]
mask: [b, 1, t_y, t_x]
"""
device = duration.device
b, _, t_y, t_x = mask.shape
cum_duration = torch.cumsum(duration, -1)
cum_duration_flat = cum_duration.view(b * t_x)
path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
path = path.view(b, t_x, t_y)
path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
path = path.unsqueeze(1).transpose(2, 3) * mask
return path
def clip_grad_value_(parameters, clip_value, norm_type=2):
if isinstance(parameters, torch.Tensor):
parameters = [parameters]
parameters = list(filter(lambda p: p.grad is not None, parameters))
norm_type = float(norm_type)
if clip_value is not None:
clip_value = float(clip_value)
total_norm = 0
for p in parameters:
param_norm = p.grad.data.norm(norm_type)
total_norm += param_norm.item() ** norm_type
if clip_value is not None:
p.grad.data.clamp_(min=-clip_value, max=clip_value)
total_norm = total_norm ** (1.0 / norm_type)
return total_norm

664
infer_pack/models.py Normal file
View File

@ -0,0 +1,664 @@
import math,pdb,os
from time import time as ttime
import torch
from torch import nn
from torch.nn import functional as F
from infer_pack import modules
from infer_pack import attentions
from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
from infer_pack.commons import init_weights
import numpy as np
from infer_pack import commons
class TextEncoder256(nn.Module):
def __init__(
self, out_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, f0=True ):
super().__init__()
self.out_channels = out_channels
self.hidden_channels = hidden_channels
self.filter_channels = filter_channels
self.n_heads = n_heads
self.n_layers = n_layers
self.kernel_size = kernel_size
self.p_dropout = p_dropout
self.emb_phone = nn.Linear(256, hidden_channels)
self.lrelu=nn.LeakyReLU(0.1,inplace=True)
if(f0==True):
self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
self.encoder = attentions.Encoder(
hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
)
self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
def forward(self, phone, pitch, lengths):
if(pitch==None):
x = self.emb_phone(phone)
else:
x = self.emb_phone(phone) + self.emb_pitch(pitch)
x = x * math.sqrt(self.hidden_channels) # [b, t, h]
x=self.lrelu(x)
x = torch.transpose(x, 1, -1) # [b, h, t]
x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
x.dtype
)
x = self.encoder(x * x_mask, x_mask)
stats = self.proj(x) * x_mask
m, logs = torch.split(stats, self.out_channels, dim=1)
return m, logs, x_mask
class TextEncoder256km(nn.Module):
def __init__(
self, out_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, f0=True ):
super().__init__()
self.out_channels = out_channels
self.hidden_channels = hidden_channels
self.filter_channels = filter_channels
self.n_heads = n_heads
self.n_layers = n_layers
self.kernel_size = kernel_size
self.p_dropout = p_dropout
# self.emb_phone = nn.Linear(256, hidden_channels)
self.emb_phone = nn.Embedding(500, hidden_channels)
self.lrelu=nn.LeakyReLU(0.1,inplace=True)
if(f0==True):
self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
self.encoder = attentions.Encoder(
hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
)
self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
def forward(self, phone, pitch, lengths):
if(pitch==None):
x = self.emb_phone(phone)
else:
x = self.emb_phone(phone) + self.emb_pitch(pitch)
x = x * math.sqrt(self.hidden_channels) # [b, t, h]
x=self.lrelu(x)
x = torch.transpose(x, 1, -1) # [b, h, t]
x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
x.dtype
)
x = self.encoder(x * x_mask, x_mask)
stats = self.proj(x) * x_mask
m, logs = torch.split(stats, self.out_channels, dim=1)
return m, logs, x_mask
class ResidualCouplingBlock(nn.Module):
def __init__(
self,
channels,
hidden_channels,
kernel_size,
dilation_rate,
n_layers,
n_flows=4,
gin_channels=0,
):
super().__init__()
self.channels = channels
self.hidden_channels = hidden_channels
self.kernel_size = kernel_size
self.dilation_rate = dilation_rate
self.n_layers = n_layers
self.n_flows = n_flows
self.gin_channels = gin_channels
self.flows = nn.ModuleList()
for i in range(n_flows):
self.flows.append(
modules.ResidualCouplingLayer(
channels,
hidden_channels,
kernel_size,
dilation_rate,
n_layers,
gin_channels=gin_channels,
mean_only=True,
)
)
self.flows.append(modules.Flip())
def forward(self, x, x_mask, g=None, reverse=False):
if not reverse:
for flow in self.flows:
x, _ = flow(x, x_mask, g=g, reverse=reverse)
else:
for flow in reversed(self.flows):
x = flow(x, x_mask, g=g, reverse=reverse)
return x
def remove_weight_norm(self):
for i in range(self.n_flows):
self.flows[i * 2].remove_weight_norm()
class PosteriorEncoder(nn.Module):
def __init__(
self,
in_channels,
out_channels,
hidden_channels,
kernel_size,
dilation_rate,
n_layers,
gin_channels=0,
):
super().__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.hidden_channels = hidden_channels
self.kernel_size = kernel_size
self.dilation_rate = dilation_rate
self.n_layers = n_layers
self.gin_channels = gin_channels
self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
self.enc = modules.WN(
hidden_channels,
kernel_size,
dilation_rate,
n_layers,
gin_channels=gin_channels,
)
self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
def forward(self, x, x_lengths, g=None):
x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
x.dtype
)
x = self.pre(x) * x_mask
x = self.enc(x, x_mask, g=g)
stats = self.proj(x) * x_mask
m, logs = torch.split(stats, self.out_channels, dim=1)
z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
return z, m, logs, x_mask
def remove_weight_norm(self):
self.enc.remove_weight_norm()
class Generator(torch.nn.Module):
def __init__(
self,
initial_channel,
resblock,
resblock_kernel_sizes,
resblock_dilation_sizes,
upsample_rates,
upsample_initial_channel,
upsample_kernel_sizes,
gin_channels=0,
):
super(Generator, self).__init__()
self.num_kernels = len(resblock_kernel_sizes)
self.num_upsamples = len(upsample_rates)
self.conv_pre = Conv1d(
initial_channel, upsample_initial_channel, 7, 1, padding=3
)
resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
self.ups = nn.ModuleList()
for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
self.ups.append(
weight_norm(
ConvTranspose1d(
upsample_initial_channel // (2**i),
upsample_initial_channel // (2 ** (i + 1)),
k,
u,
padding=(k - u) // 2,
)
)
)
self.resblocks = nn.ModuleList()
for i in range(len(self.ups)):
ch = upsample_initial_channel // (2 ** (i + 1))
for j, (k, d) in enumerate(
zip(resblock_kernel_sizes, resblock_dilation_sizes)
):
self.resblocks.append(resblock(ch, k, d))
self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
self.ups.apply(init_weights)
if gin_channels != 0:
self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
def forward(self, x, g=None):
x = self.conv_pre(x)
if g is not None:
x = x + self.cond(g)
for i in range(self.num_upsamples):
x = F.leaky_relu(x, modules.LRELU_SLOPE)
x = self.ups[i](x)
xs = None
for j in range(self.num_kernels):
if xs is None:
xs = self.resblocks[i * self.num_kernels + j](x)
else:
xs += self.resblocks[i * self.num_kernels + j](x)
x = xs / self.num_kernels
x = F.leaky_relu(x)
x = self.conv_post(x)
x = torch.tanh(x)
return x
def remove_weight_norm(self):
for l in self.ups:
remove_weight_norm(l)
for l in self.resblocks:
l.remove_weight_norm()
class SineGen(torch.nn.Module):
""" Definition of sine generator
SineGen(samp_rate, harmonic_num = 0,
sine_amp = 0.1, noise_std = 0.003,
voiced_threshold = 0,
flag_for_pulse=False)
samp_rate: sampling rate in Hz
harmonic_num: number of harmonic overtones (default 0)
sine_amp: amplitude of sine-wavefrom (default 0.1)
noise_std: std of Gaussian noise (default 0.003)
voiced_thoreshold: F0 threshold for U/V classification (default 0)
flag_for_pulse: this SinGen is used inside PulseGen (default False)
Note: when flag_for_pulse is True, the first time step of a voiced
segment is always sin(np.pi) or cos(0)
"""
def __init__(self, samp_rate, harmonic_num=0,
sine_amp=0.1, noise_std=0.003,
voiced_threshold=0,
flag_for_pulse=False):
super(SineGen, self).__init__()
self.sine_amp = sine_amp
self.noise_std = noise_std
self.harmonic_num = harmonic_num
self.dim = self.harmonic_num + 1
self.sampling_rate = samp_rate
self.voiced_threshold = voiced_threshold
def _f02uv(self, f0):
# generate uv signal
uv = torch.ones_like(f0)
uv = uv * (f0 > self.voiced_threshold)
return uv
def forward(self, f0,upp):
""" sine_tensor, uv = forward(f0)
input F0: tensor(batchsize=1, length, dim=1)
f0 for unvoiced steps should be 0
output sine_tensor: tensor(batchsize=1, length, dim)
output uv: tensor(batchsize=1, length, 1)
"""
with torch.no_grad():
f0 = f0[:, None].transpose(1, 2)
f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim,device=f0.device)
# fundamental component
f0_buf[:, :, 0] = f0[:, :, 0]
for idx in np.arange(self.harmonic_num):f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (idx + 2)# idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
rad_values = (f0_buf / self.sampling_rate) % 1###%1意味着n_har的乘积无法后处理优化
rand_ini = torch.rand(f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device)
rand_ini[:, 0] = 0
rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
tmp_over_one = torch.cumsum(rad_values, 1)# % 1 #####%1意味着后面的cumsum无法再优化
tmp_over_one*=upp
tmp_over_one=F.interpolate(tmp_over_one.transpose(2, 1), scale_factor=upp, mode='linear', align_corners=True).transpose(2, 1)
rad_values=F.interpolate(rad_values.transpose(2, 1), scale_factor=upp, mode='nearest').transpose(2, 1)#######
tmp_over_one%=1
tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
cumsum_shift = torch.zeros_like(rad_values)
cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
sine_waves = torch.sin(torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi)
sine_waves = sine_waves * self.sine_amp
uv = self._f02uv(f0)
uv = F.interpolate(uv.transpose(2, 1), scale_factor=upp, mode='nearest').transpose(2, 1)
noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
noise = noise_amp * torch.randn_like(sine_waves)
sine_waves = sine_waves * uv + noise
return sine_waves, uv, noise
class SourceModuleHnNSF(torch.nn.Module):
""" SourceModule for hn-nsf
SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
add_noise_std=0.003, voiced_threshod=0)
sampling_rate: sampling_rate in Hz
harmonic_num: number of harmonic above F0 (default: 0)
sine_amp: amplitude of sine source signal (default: 0.1)
add_noise_std: std of additive Gaussian noise (default: 0.003)
note that amplitude of noise in unvoiced is decided
by sine_amp
voiced_threshold: threhold to set U/V given F0 (default: 0)
Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
F0_sampled (batchsize, length, 1)
Sine_source (batchsize, length, 1)
noise_source (batchsize, length 1)
uv (batchsize, length, 1)
"""
def __init__(self, sampling_rate, harmonic_num=0, sine_amp=0.1,
add_noise_std=0.003, voiced_threshod=0,is_half=True):
super(SourceModuleHnNSF, self).__init__()
self.sine_amp = sine_amp
self.noise_std = add_noise_std
self.is_half=is_half
# to produce sine waveforms
self.l_sin_gen = SineGen(sampling_rate, harmonic_num,
sine_amp, add_noise_std, voiced_threshod)
# to merge source harmonics into a single excitation
self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
self.l_tanh = torch.nn.Tanh()
def forward(self, x,upp=None):
sine_wavs, uv, _ = self.l_sin_gen(x,upp)
if(self.is_half==True):sine_wavs=sine_wavs.half()
sine_merge = self.l_tanh(self.l_linear(sine_wavs))
return sine_merge,None,None# noise, uv
class GeneratorNSF(torch.nn.Module):
def __init__(
self,
initial_channel,
resblock,
resblock_kernel_sizes,
resblock_dilation_sizes,
upsample_rates,
upsample_initial_channel,
upsample_kernel_sizes,
gin_channels=0,
sr=40000,
is_half=False
):
super(GeneratorNSF, self).__init__()
self.num_kernels = len(resblock_kernel_sizes)
self.num_upsamples = len(upsample_rates)
self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
self.m_source = SourceModuleHnNSF(
sampling_rate=sr,
harmonic_num=0,
is_half=is_half
)
self.noise_convs = nn.ModuleList()
self.conv_pre = Conv1d(
initial_channel, upsample_initial_channel, 7, 1, padding=3
)
resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
self.ups = nn.ModuleList()
for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
c_cur = upsample_initial_channel // (2 ** (i + 1))
self.ups.append(
weight_norm(
ConvTranspose1d(
upsample_initial_channel // (2**i),
upsample_initial_channel // (2 ** (i + 1)),
k,
u,
padding=(k - u) // 2,
)
)
)
if i + 1 < len(upsample_rates):
stride_f0 = np.prod(upsample_rates[i + 1:])
self.noise_convs.append(Conv1d(
1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=stride_f0 // 2))
else:
self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
self.resblocks = nn.ModuleList()
for i in range(len(self.ups)):
ch = upsample_initial_channel // (2 ** (i + 1))
for j, (k, d) in enumerate(
zip(resblock_kernel_sizes, resblock_dilation_sizes)
):
self.resblocks.append(resblock(ch, k, d))
self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
self.ups.apply(init_weights)
if gin_channels != 0:
self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
self.upp=np.prod(upsample_rates)
def forward(self, x, f0,g=None):
har_source, noi_source, uv = self.m_source(f0,self.upp)
har_source = har_source.transpose(1, 2)
x = self.conv_pre(x)
if g is not None:
x = x + self.cond(g)
for i in range(self.num_upsamples):
x = F.leaky_relu(x, modules.LRELU_SLOPE)
x = self.ups[i](x)
x_source = self.noise_convs[i](har_source)
x = x + x_source
xs = None
for j in range(self.num_kernels):
if xs is None:
xs = self.resblocks[i * self.num_kernels + j](x)
else:
xs += self.resblocks[i * self.num_kernels + j](x)
x = xs / self.num_kernels
x = F.leaky_relu(x)
x = self.conv_post(x)
x = torch.tanh(x)
return x
def remove_weight_norm(self):
for l in self.ups:
remove_weight_norm(l)
for l in self.resblocks:
l.remove_weight_norm()
class SynthesizerTrnMs256NSF(nn.Module):
"""
Synthesizer for Training
"""
def __init__(
self,
spec_channels,
segment_size,
inter_channels,
hidden_channels,
filter_channels,
n_heads,
n_layers,
kernel_size,
p_dropout,
resblock,
resblock_kernel_sizes,
resblock_dilation_sizes,
upsample_rates,
upsample_initial_channel,
upsample_kernel_sizes,
spk_embed_dim,
gin_channels=0,
sr=40000,
**kwargs
):
super().__init__()
self.spec_channels = spec_channels
self.inter_channels = inter_channels
self.hidden_channels = hidden_channels
self.filter_channels = filter_channels
self.n_heads = n_heads
self.n_layers = n_layers
self.kernel_size = kernel_size
self.p_dropout = p_dropout
self.resblock = resblock
self.resblock_kernel_sizes = resblock_kernel_sizes
self.resblock_dilation_sizes = resblock_dilation_sizes
self.upsample_rates = upsample_rates
self.upsample_initial_channel = upsample_initial_channel
self.upsample_kernel_sizes = upsample_kernel_sizes
self.segment_size = segment_size
self.gin_channels = gin_channels
self.spk_embed_dim=spk_embed_dim
self.enc_p = TextEncoder256(
inter_channels,
hidden_channels,
filter_channels,
n_heads,
n_layers,
kernel_size,
p_dropout,
)
self.dec = GeneratorNSF(
inter_channels,
resblock,
resblock_kernel_sizes,
resblock_dilation_sizes,
upsample_rates,
upsample_initial_channel,
upsample_kernel_sizes,
gin_channels=0,
sr=sr,
is_half=kwargs["is_half"]
)
self.enc_q = PosteriorEncoder(
spec_channels,
inter_channels,
hidden_channels,
5,
1,
16,
gin_channels=gin_channels,
)
self.flow = ResidualCouplingBlock(
inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
)
self.emb_g = nn.Linear(self.spk_embed_dim, gin_channels)
def remove_weight_norm(self):
self.dec.remove_weight_norm()
self.flow.remove_weight_norm()
self.enc_q.remove_weight_norm()
def infer(self, phone, phone_lengths, pitch,pitchf, ds,max_len=None):
m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
if("float16"in str(m_p.dtype)):ds=ds.half()
ds=ds.to(m_p.device)
g = self.emb_g(ds).unsqueeze(-1) # [b, h, 1]#
z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66) * x_mask
z = self.flow(z_p, x_mask, g=g, reverse=True)
o = self.dec((z * x_mask)[:, :, :max_len],pitchf, g=None)
return o, x_mask, (z, z_p, m_p, logs_p)
class SynthesizerTrn256NSFkm(nn.Module):
"""
Synthesizer for Training
"""
def __init__(
self,
spec_channels,
segment_size,
inter_channels,
hidden_channels,
filter_channels,
n_heads,
n_layers,
kernel_size,
p_dropout,
resblock,
resblock_kernel_sizes,
resblock_dilation_sizes,
upsample_rates,
upsample_initial_channel,
upsample_kernel_sizes,
spk_embed_dim,
gin_channels=0,
sr=40000,
**kwargs
):
super().__init__()
self.spec_channels = spec_channels
self.inter_channels = inter_channels
self.hidden_channels = hidden_channels
self.filter_channels = filter_channels
self.n_heads = n_heads
self.n_layers = n_layers
self.kernel_size = kernel_size
self.p_dropout = p_dropout
self.resblock = resblock
self.resblock_kernel_sizes = resblock_kernel_sizes
self.resblock_dilation_sizes = resblock_dilation_sizes
self.upsample_rates = upsample_rates
self.upsample_initial_channel = upsample_initial_channel
self.upsample_kernel_sizes = upsample_kernel_sizes
self.segment_size = segment_size
self.gin_channels = gin_channels
self.enc_p = TextEncoder256km(
inter_channels,
hidden_channels,
filter_channels,
n_heads,
n_layers,
kernel_size,
p_dropout,
)
self.dec = GeneratorNSF(
inter_channels,
resblock,
resblock_kernel_sizes,
resblock_dilation_sizes,
upsample_rates,
upsample_initial_channel,
upsample_kernel_sizes,
gin_channels=0,
sr=sr,
is_half=kwargs["is_half"]
)
self.enc_q = PosteriorEncoder(
spec_channels,
inter_channels,
hidden_channels,
5,
1,
16,
gin_channels=gin_channels,
)
self.flow = ResidualCouplingBlock(
inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
)
def remove_weight_norm(self):
self.dec.remove_weight_norm()
self.flow.remove_weight_norm()
self.enc_q.remove_weight_norm()
def forward(self, phone, phone_lengths, pitch, pitchf, y, y_lengths):
m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=None)
z_p = self.flow(z, y_mask, g=None)
z_slice, ids_slice = commons.rand_slice_segments(
z, y_lengths, self.segment_size
)
pitchf = commons.slice_segments2(
pitchf, ids_slice, self.segment_size
)
o = self.dec(z_slice, pitchf,g=None)
return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
def infer(self, phone, phone_lengths, pitch, nsff0,max_len=None):
# torch.cuda.synchronize()
# t0=ttime()
m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
# torch.cuda.synchronize()
# t1=ttime()
z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66) * x_mask
# torch.cuda.synchronize()
# t2=ttime()
z = self.flow(z_p, x_mask, g=None, reverse=True)
# torch.cuda.synchronize()
# t3=ttime()
o = self.dec((z * x_mask)[:, :, :max_len], nsff0,g=None)
# torch.cuda.synchronize()
# t4=ttime()
# print(1233333333333333333333333,t1-t0,t2-t1,t3-t2,t4-t3)
return o, x_mask, (z, z_p, m_p, logs_p)

522
infer_pack/modules.py Normal file
View File

@ -0,0 +1,522 @@
import copy
import math
import numpy as np
import scipy
import torch
from torch import nn
from torch.nn import functional as F
from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
from torch.nn.utils import weight_norm, remove_weight_norm
from infer_pack import commons
from infer_pack.commons import init_weights, get_padding
from infer_pack.transforms import piecewise_rational_quadratic_transform
LRELU_SLOPE = 0.1
class LayerNorm(nn.Module):
def __init__(self, channels, eps=1e-5):
super().__init__()
self.channels = channels
self.eps = eps
self.gamma = nn.Parameter(torch.ones(channels))
self.beta = nn.Parameter(torch.zeros(channels))
def forward(self, x):
x = x.transpose(1, -1)
x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
return x.transpose(1, -1)
class ConvReluNorm(nn.Module):
def __init__(
self,
in_channels,
hidden_channels,
out_channels,
kernel_size,
n_layers,
p_dropout,
):
super().__init__()
self.in_channels = in_channels
self.hidden_channels = hidden_channels
self.out_channels = out_channels
self.kernel_size = kernel_size
self.n_layers = n_layers
self.p_dropout = p_dropout
assert n_layers > 1, "Number of layers should be larger than 0."
self.conv_layers = nn.ModuleList()
self.norm_layers = nn.ModuleList()
self.conv_layers.append(
nn.Conv1d(
in_channels, hidden_channels, kernel_size, padding=kernel_size // 2
)
)
self.norm_layers.append(LayerNorm(hidden_channels))
self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout))
for _ in range(n_layers - 1):
self.conv_layers.append(
nn.Conv1d(
hidden_channels,
hidden_channels,
kernel_size,
padding=kernel_size // 2,
)
)
self.norm_layers.append(LayerNorm(hidden_channels))
self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
self.proj.weight.data.zero_()
self.proj.bias.data.zero_()
def forward(self, x, x_mask):
x_org = x
for i in range(self.n_layers):
x = self.conv_layers[i](x * x_mask)
x = self.norm_layers[i](x)
x = self.relu_drop(x)
x = x_org + self.proj(x)
return x * x_mask
class DDSConv(nn.Module):
"""
Dialted and Depth-Separable Convolution
"""
def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0):
super().__init__()
self.channels = channels
self.kernel_size = kernel_size
self.n_layers = n_layers
self.p_dropout = p_dropout
self.drop = nn.Dropout(p_dropout)
self.convs_sep = nn.ModuleList()
self.convs_1x1 = nn.ModuleList()
self.norms_1 = nn.ModuleList()
self.norms_2 = nn.ModuleList()
for i in range(n_layers):
dilation = kernel_size**i
padding = (kernel_size * dilation - dilation) // 2
self.convs_sep.append(
nn.Conv1d(
channels,
channels,
kernel_size,
groups=channels,
dilation=dilation,
padding=padding,
)
)
self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
self.norms_1.append(LayerNorm(channels))
self.norms_2.append(LayerNorm(channels))
def forward(self, x, x_mask, g=None):
if g is not None:
x = x + g
for i in range(self.n_layers):
y = self.convs_sep[i](x * x_mask)
y = self.norms_1[i](y)
y = F.gelu(y)
y = self.convs_1x1[i](y)
y = self.norms_2[i](y)
y = F.gelu(y)
y = self.drop(y)
x = x + y
return x * x_mask
class WN(torch.nn.Module):
def __init__(
self,
hidden_channels,
kernel_size,
dilation_rate,
n_layers,
gin_channels=0,
p_dropout=0,
):
super(WN, self).__init__()
assert kernel_size % 2 == 1
self.hidden_channels = hidden_channels
self.kernel_size = (kernel_size,)
self.dilation_rate = dilation_rate
self.n_layers = n_layers
self.gin_channels = gin_channels
self.p_dropout = p_dropout
self.in_layers = torch.nn.ModuleList()
self.res_skip_layers = torch.nn.ModuleList()
self.drop = nn.Dropout(p_dropout)
if gin_channels != 0:
cond_layer = torch.nn.Conv1d(
gin_channels, 2 * hidden_channels * n_layers, 1
)
self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight")
for i in range(n_layers):
dilation = dilation_rate**i
padding = int((kernel_size * dilation - dilation) / 2)
in_layer = torch.nn.Conv1d(
hidden_channels,
2 * hidden_channels,
kernel_size,
dilation=dilation,
padding=padding,
)
in_layer = torch.nn.utils.weight_norm(in_layer, name="weight")
self.in_layers.append(in_layer)
# last one is not necessary
if i < n_layers - 1:
res_skip_channels = 2 * hidden_channels
else:
res_skip_channels = hidden_channels
res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight")
self.res_skip_layers.append(res_skip_layer)
def forward(self, x, x_mask, g=None, **kwargs):
output = torch.zeros_like(x)
n_channels_tensor = torch.IntTensor([self.hidden_channels])
if g is not None:
g = self.cond_layer(g)
for i in range(self.n_layers):
x_in = self.in_layers[i](x)
if g is not None:
cond_offset = i * 2 * self.hidden_channels
g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :]
else:
g_l = torch.zeros_like(x_in)
acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor)
acts = self.drop(acts)
res_skip_acts = self.res_skip_layers[i](acts)
if i < self.n_layers - 1:
res_acts = res_skip_acts[:, : self.hidden_channels, :]
x = (x + res_acts) * x_mask
output = output + res_skip_acts[:, self.hidden_channels :, :]
else:
output = output + res_skip_acts
return output * x_mask
def remove_weight_norm(self):
if self.gin_channels != 0:
torch.nn.utils.remove_weight_norm(self.cond_layer)
for l in self.in_layers:
torch.nn.utils.remove_weight_norm(l)
for l in self.res_skip_layers:
torch.nn.utils.remove_weight_norm(l)
class ResBlock1(torch.nn.Module):
def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
super(ResBlock1, self).__init__()
self.convs1 = nn.ModuleList(
[
weight_norm(
Conv1d(
channels,
channels,
kernel_size,
1,
dilation=dilation[0],
padding=get_padding(kernel_size, dilation[0]),
)
),
weight_norm(
Conv1d(
channels,
channels,
kernel_size,
1,
dilation=dilation[1],
padding=get_padding(kernel_size, dilation[1]),
)
),
weight_norm(
Conv1d(
channels,
channels,
kernel_size,
1,
dilation=dilation[2],
padding=get_padding(kernel_size, dilation[2]),
)
),
]
)
self.convs1.apply(init_weights)
self.convs2 = nn.ModuleList(
[
weight_norm(
Conv1d(
channels,
channels,
kernel_size,
1,
dilation=1,
padding=get_padding(kernel_size, 1),
)
),
weight_norm(
Conv1d(
channels,
channels,
kernel_size,
1,
dilation=1,
padding=get_padding(kernel_size, 1),
)
),
weight_norm(
Conv1d(
channels,
channels,
kernel_size,
1,
dilation=1,
padding=get_padding(kernel_size, 1),
)
),
]
)
self.convs2.apply(init_weights)
def forward(self, x, x_mask=None):
for c1, c2 in zip(self.convs1, self.convs2):
xt = F.leaky_relu(x, LRELU_SLOPE)
if x_mask is not None:
xt = xt * x_mask
xt = c1(xt)
xt = F.leaky_relu(xt, LRELU_SLOPE)
if x_mask is not None:
xt = xt * x_mask
xt = c2(xt)
x = xt + x
if x_mask is not None:
x = x * x_mask
return x
def remove_weight_norm(self):
for l in self.convs1:
remove_weight_norm(l)
for l in self.convs2:
remove_weight_norm(l)
class ResBlock2(torch.nn.Module):
def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
super(ResBlock2, self).__init__()
self.convs = nn.ModuleList(
[
weight_norm(
Conv1d(
channels,
channels,
kernel_size,
1,
dilation=dilation[0],
padding=get_padding(kernel_size, dilation[0]),
)
),
weight_norm(
Conv1d(
channels,
channels,
kernel_size,
1,
dilation=dilation[1],
padding=get_padding(kernel_size, dilation[1]),
)
),
]
)
self.convs.apply(init_weights)
def forward(self, x, x_mask=None):
for c in self.convs:
xt = F.leaky_relu(x, LRELU_SLOPE)
if x_mask is not None:
xt = xt * x_mask
xt = c(xt)
x = xt + x
if x_mask is not None:
x = x * x_mask
return x
def remove_weight_norm(self):
for l in self.convs:
remove_weight_norm(l)
class Log(nn.Module):
def forward(self, x, x_mask, reverse=False, **kwargs):
if not reverse:
y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
logdet = torch.sum(-y, [1, 2])
return y, logdet
else:
x = torch.exp(x) * x_mask
return x
class Flip(nn.Module):
def forward(self, x, *args, reverse=False, **kwargs):
x = torch.flip(x, [1])
if not reverse:
logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
return x, logdet
else:
return x
class ElementwiseAffine(nn.Module):
def __init__(self, channels):
super().__init__()
self.channels = channels
self.m = nn.Parameter(torch.zeros(channels, 1))
self.logs = nn.Parameter(torch.zeros(channels, 1))
def forward(self, x, x_mask, reverse=False, **kwargs):
if not reverse:
y = self.m + torch.exp(self.logs) * x
y = y * x_mask
logdet = torch.sum(self.logs * x_mask, [1, 2])
return y, logdet
else:
x = (x - self.m) * torch.exp(-self.logs) * x_mask
return x
class ResidualCouplingLayer(nn.Module):
def __init__(
self,
channels,
hidden_channels,
kernel_size,
dilation_rate,
n_layers,
p_dropout=0,
gin_channels=0,
mean_only=False,
):
assert channels % 2 == 0, "channels should be divisible by 2"
super().__init__()
self.channels = channels
self.hidden_channels = hidden_channels
self.kernel_size = kernel_size
self.dilation_rate = dilation_rate
self.n_layers = n_layers
self.half_channels = channels // 2
self.mean_only = mean_only
self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
self.enc = WN(
hidden_channels,
kernel_size,
dilation_rate,
n_layers,
p_dropout=p_dropout,
gin_channels=gin_channels,
)
self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
self.post.weight.data.zero_()
self.post.bias.data.zero_()
def forward(self, x, x_mask, g=None, reverse=False):
x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
h = self.pre(x0) * x_mask
h = self.enc(h, x_mask, g=g)
stats = self.post(h) * x_mask
if not self.mean_only:
m, logs = torch.split(stats, [self.half_channels] * 2, 1)
else:
m = stats
logs = torch.zeros_like(m)
if not reverse:
x1 = m + x1 * torch.exp(logs) * x_mask
x = torch.cat([x0, x1], 1)
logdet = torch.sum(logs, [1, 2])
return x, logdet
else:
x1 = (x1 - m) * torch.exp(-logs) * x_mask
x = torch.cat([x0, x1], 1)
return x
def remove_weight_norm(self):
self.enc.remove_weight_norm()
class ConvFlow(nn.Module):
def __init__(
self,
in_channels,
filter_channels,
kernel_size,
n_layers,
num_bins=10,
tail_bound=5.0,
):
super().__init__()
self.in_channels = in_channels
self.filter_channels = filter_channels
self.kernel_size = kernel_size
self.n_layers = n_layers
self.num_bins = num_bins
self.tail_bound = tail_bound
self.half_channels = in_channels // 2
self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.0)
self.proj = nn.Conv1d(
filter_channels, self.half_channels * (num_bins * 3 - 1), 1
)
self.proj.weight.data.zero_()
self.proj.bias.data.zero_()
def forward(self, x, x_mask, g=None, reverse=False):
x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
h = self.pre(x0)
h = self.convs(h, x_mask, g=g)
h = self.proj(h) * x_mask
b, c, t = x0.shape
h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?]
unnormalized_widths = h[..., : self.num_bins] / math.sqrt(self.filter_channels)
unnormalized_heights = h[..., self.num_bins : 2 * self.num_bins] / math.sqrt(
self.filter_channels
)
unnormalized_derivatives = h[..., 2 * self.num_bins :]
x1, logabsdet = piecewise_rational_quadratic_transform(
x1,
unnormalized_widths,
unnormalized_heights,
unnormalized_derivatives,
inverse=reverse,
tails="linear",
tail_bound=self.tail_bound,
)
x = torch.cat([x0, x1], 1) * x_mask
logdet = torch.sum(logabsdet * x_mask, [1, 2])
if not reverse:
return x, logdet
else:
return x

193
infer_pack/transforms.py Normal file
View File

@ -0,0 +1,193 @@
import torch
from torch.nn import functional as F
import numpy as np
DEFAULT_MIN_BIN_WIDTH = 1e-3
DEFAULT_MIN_BIN_HEIGHT = 1e-3
DEFAULT_MIN_DERIVATIVE = 1e-3
def piecewise_rational_quadratic_transform(inputs,
unnormalized_widths,
unnormalized_heights,
unnormalized_derivatives,
inverse=False,
tails=None,
tail_bound=1.,
min_bin_width=DEFAULT_MIN_BIN_WIDTH,
min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
min_derivative=DEFAULT_MIN_DERIVATIVE):
if tails is None:
spline_fn = rational_quadratic_spline
spline_kwargs = {}
else:
spline_fn = unconstrained_rational_quadratic_spline
spline_kwargs = {
'tails': tails,
'tail_bound': tail_bound
}
outputs, logabsdet = spline_fn(
inputs=inputs,
unnormalized_widths=unnormalized_widths,
unnormalized_heights=unnormalized_heights,
unnormalized_derivatives=unnormalized_derivatives,
inverse=inverse,
min_bin_width=min_bin_width,
min_bin_height=min_bin_height,
min_derivative=min_derivative,
**spline_kwargs
)
return outputs, logabsdet
def searchsorted(bin_locations, inputs, eps=1e-6):
bin_locations[..., -1] += eps
return torch.sum(
inputs[..., None] >= bin_locations,
dim=-1
) - 1
def unconstrained_rational_quadratic_spline(inputs,
unnormalized_widths,
unnormalized_heights,
unnormalized_derivatives,
inverse=False,
tails='linear',
tail_bound=1.,
min_bin_width=DEFAULT_MIN_BIN_WIDTH,
min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
min_derivative=DEFAULT_MIN_DERIVATIVE):
inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound)
outside_interval_mask = ~inside_interval_mask
outputs = torch.zeros_like(inputs)
logabsdet = torch.zeros_like(inputs)
if tails == 'linear':
unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1))
constant = np.log(np.exp(1 - min_derivative) - 1)
unnormalized_derivatives[..., 0] = constant
unnormalized_derivatives[..., -1] = constant
outputs[outside_interval_mask] = inputs[outside_interval_mask]
logabsdet[outside_interval_mask] = 0
else:
raise RuntimeError('{} tails are not implemented.'.format(tails))
outputs[inside_interval_mask], logabsdet[inside_interval_mask] = rational_quadratic_spline(
inputs=inputs[inside_interval_mask],
unnormalized_widths=unnormalized_widths[inside_interval_mask, :],
unnormalized_heights=unnormalized_heights[inside_interval_mask, :],
unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :],
inverse=inverse,
left=-tail_bound, right=tail_bound, bottom=-tail_bound, top=tail_bound,
min_bin_width=min_bin_width,
min_bin_height=min_bin_height,
min_derivative=min_derivative
)
return outputs, logabsdet
def rational_quadratic_spline(inputs,
unnormalized_widths,
unnormalized_heights,
unnormalized_derivatives,
inverse=False,
left=0., right=1., bottom=0., top=1.,
min_bin_width=DEFAULT_MIN_BIN_WIDTH,
min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
min_derivative=DEFAULT_MIN_DERIVATIVE):
if torch.min(inputs) < left or torch.max(inputs) > right:
raise ValueError('Input to a transform is not within its domain')
num_bins = unnormalized_widths.shape[-1]
if min_bin_width * num_bins > 1.0:
raise ValueError('Minimal bin width too large for the number of bins')
if min_bin_height * num_bins > 1.0:
raise ValueError('Minimal bin height too large for the number of bins')
widths = F.softmax(unnormalized_widths, dim=-1)
widths = min_bin_width + (1 - min_bin_width * num_bins) * widths
cumwidths = torch.cumsum(widths, dim=-1)
cumwidths = F.pad(cumwidths, pad=(1, 0), mode='constant', value=0.0)
cumwidths = (right - left) * cumwidths + left
cumwidths[..., 0] = left
cumwidths[..., -1] = right
widths = cumwidths[..., 1:] - cumwidths[..., :-1]
derivatives = min_derivative + F.softplus(unnormalized_derivatives)
heights = F.softmax(unnormalized_heights, dim=-1)
heights = min_bin_height + (1 - min_bin_height * num_bins) * heights
cumheights = torch.cumsum(heights, dim=-1)
cumheights = F.pad(cumheights, pad=(1, 0), mode='constant', value=0.0)
cumheights = (top - bottom) * cumheights + bottom
cumheights[..., 0] = bottom
cumheights[..., -1] = top
heights = cumheights[..., 1:] - cumheights[..., :-1]
if inverse:
bin_idx = searchsorted(cumheights, inputs)[..., None]
else:
bin_idx = searchsorted(cumwidths, inputs)[..., None]
input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0]
input_bin_widths = widths.gather(-1, bin_idx)[..., 0]
input_cumheights = cumheights.gather(-1, bin_idx)[..., 0]
delta = heights / widths
input_delta = delta.gather(-1, bin_idx)[..., 0]
input_derivatives = derivatives.gather(-1, bin_idx)[..., 0]
input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0]
input_heights = heights.gather(-1, bin_idx)[..., 0]
if inverse:
a = (((inputs - input_cumheights) * (input_derivatives
+ input_derivatives_plus_one
- 2 * input_delta)
+ input_heights * (input_delta - input_derivatives)))
b = (input_heights * input_derivatives
- (inputs - input_cumheights) * (input_derivatives
+ input_derivatives_plus_one
- 2 * input_delta))
c = - input_delta * (inputs - input_cumheights)
discriminant = b.pow(2) - 4 * a * c
assert (discriminant >= 0).all()
root = (2 * c) / (-b - torch.sqrt(discriminant))
outputs = root * input_bin_widths + input_cumwidths
theta_one_minus_theta = root * (1 - root)
denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta)
* theta_one_minus_theta)
derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * root.pow(2)
+ 2 * input_delta * theta_one_minus_theta
+ input_derivatives * (1 - root).pow(2))
logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
return outputs, -logabsdet
else:
theta = (inputs - input_cumwidths) / input_bin_widths
theta_one_minus_theta = theta * (1 - theta)
numerator = input_heights * (input_delta * theta.pow(2)
+ input_derivatives * theta_one_minus_theta)
denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta)
* theta_one_minus_theta)
outputs = input_cumheights + numerator / denominator
derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * theta.pow(2)
+ 2 * input_delta * theta_one_minus_theta
+ input_derivatives * (1 - theta).pow(2))
logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
return outputs, logabsdet

108
infer_uvr5.py Normal file
View File

@ -0,0 +1,108 @@
import os,sys,torch,warnings,pdb
warnings.filterwarnings("ignore")
import librosa
import importlib
import numpy as np
import hashlib , math
from tqdm import tqdm
from uvr5_pack.lib_v5 import spec_utils
from uvr5_pack.utils import _get_name_params,inference
from uvr5_pack.lib_v5.model_param_init import ModelParameters
from scipy.io import wavfile
class _audio_pre_():
def __init__(self, model_path,device,is_half):
self.model_path = model_path
self.device = device
self.data = {
# Processing Options
'postprocess': False,
'tta': False,
# Constants
'window_size': 512,
'agg': 10,
'high_end_process': 'mirroring',
}
nn_arch_sizes = [
31191, # default
33966,61968, 123821, 123812, 537238 # custom
]
self.nn_architecture = list('{}KB'.format(s) for s in nn_arch_sizes)
model_size = math.ceil(os.stat(model_path ).st_size / 1024)
nn_architecture = '{}KB'.format(min(nn_arch_sizes, key=lambda x:abs(x-model_size)))
nets = importlib.import_module('uvr5_pack.lib_v5.nets' + f'_{nn_architecture}'.replace('_{}KB'.format(nn_arch_sizes[0]), ''), package=None)
model_hash = hashlib.md5(open(model_path,'rb').read()).hexdigest()
param_name ,model_params_d = _get_name_params(model_path , model_hash)
mp = ModelParameters(model_params_d)
model = nets.CascadedASPPNet(mp.param['bins'] * 2)
cpk = torch.load( model_path , map_location='cpu')
model.load_state_dict(cpk)
model.eval()
if(is_half==True):model = model.half().to(device)
else:model = model.to(device)
self.mp = mp
self.model = model
def _path_audio_(self, music_file ,ins_root=None,vocal_root=None):
if(ins_root is None and vocal_root is None):return "No save root."
name=os.path.basename(music_file)
if(ins_root is not None):os.makedirs(ins_root, exist_ok=True)
if(vocal_root is not None):os.makedirs(vocal_root , exist_ok=True)
X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {}
bands_n = len(self.mp.param['band'])
# print(bands_n)
for d in range(bands_n, 0, -1):
bp = self.mp.param['band'][d]
if d == bands_n: # high-end band
X_wave[d], _ = librosa.core.load(
music_file, bp['sr'], False, dtype=np.float32, res_type=bp['res_type'])
if X_wave[d].ndim == 1:
X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]])
else: # lower bands
X_wave[d] = librosa.core.resample(X_wave[d+1], self.mp.param['band'][d+1]['sr'], bp['sr'], res_type=bp['res_type'])
# Stft of wave source
X_spec_s[d] = spec_utils.wave_to_spectrogram_mt(X_wave[d], bp['hl'], bp['n_fft'], self.mp.param['mid_side'], self.mp.param['mid_side_b2'], self.mp.param['reverse'])
# pdb.set_trace()
if d == bands_n and self.data['high_end_process'] != 'none':
input_high_end_h = (bp['n_fft']//2 - bp['crop_stop']) + ( self.mp.param['pre_filter_stop'] - self.mp.param['pre_filter_start'])
input_high_end = X_spec_s[d][:, bp['n_fft']//2-input_high_end_h:bp['n_fft']//2, :]
X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp)
aggresive_set = float(self.data['agg']/100)
aggressiveness = {'value': aggresive_set, 'split_bin': self.mp.param['band'][1]['crop_stop']}
with torch.no_grad():
pred, X_mag, X_phase = inference(X_spec_m,self.device,self.model, aggressiveness,self.data)
# Postprocess
if self.data['postprocess']:
pred_inv = np.clip(X_mag - pred, 0, np.inf)
pred = spec_utils.mask_silence(pred, pred_inv)
y_spec_m = pred * X_phase
v_spec_m = X_spec_m - y_spec_m
if (ins_root is not None):
if self.data['high_end_process'].startswith('mirroring'):
input_high_end_ = spec_utils.mirroring(self.data['high_end_process'], y_spec_m, input_high_end, self.mp)
wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp,input_high_end_h, input_high_end_)
else:
wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp)
print ('%s instruments done'%name)
wavfile.write(os.path.join(ins_root, 'instrument_{}.wav'.format(name) ), self.mp.param['sr'], (np.array(wav_instrument)*32768).astype("int16")) #
if (vocal_root is not None):
if self.data['high_end_process'].startswith('mirroring'):
input_high_end_ = spec_utils.mirroring(self.data['high_end_process'], v_spec_m, input_high_end, self.mp)
wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp, input_high_end_h, input_high_end_)
else:
wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp)
print ('%s vocals done'%name)
wavfile.write(os.path.join(vocal_root , 'vocal_{}.wav'.format(name) ), self.mp.param['sr'], (np.array(wav_vocals)*32768).astype("int16"))
if __name__ == '__main__':
device = 'cuda'
is_half=True
model_path='uvr5_weights/2_HP-UVR.pth'
pre_fun = _audio_pre_(model_path=model_path,device=device,is_half=True)
audio_path = '神女劈观.aac'
save_path = 'opt'
pre_fun._path_audio_(audio_path , save_path,save_path)

151
slicer.py Normal file
View File

@ -0,0 +1,151 @@
import os.path
from argparse import ArgumentParser
import time
import librosa
import numpy as np
import soundfile
from scipy.ndimage import maximum_filter1d, uniform_filter1d
def timeit(func):
def run(*args, **kwargs):
t = time.time()
res = func(*args, **kwargs)
print('executing \'%s\' costed %.3fs' % (func.__name__, time.time() - t))
return res
return run
# @timeit
def _window_maximum(arr, win_sz):
return maximum_filter1d(arr, size=win_sz)[win_sz // 2: win_sz // 2 + arr.shape[0] - win_sz + 1]
# @timeit
def _window_rms(arr, win_sz):
filtered = np.sqrt(uniform_filter1d(np.power(arr, 2), win_sz) - np.power(uniform_filter1d(arr, win_sz), 2))
return filtered[win_sz // 2: win_sz // 2 + arr.shape[0] - win_sz + 1]
def level2db(levels, eps=1e-12):
return 20 * np.log10(np.clip(levels, a_min=eps, a_max=1))
def _apply_slice(audio, begin, end):
if len(audio.shape) > 1:
return audio[:, begin: end]
else:
return audio[begin: end]
class Slicer:
def __init__(self,
sr: int,
db_threshold: float = -40,
min_length: int = 5000,
win_l: int = 300,
win_s: int = 20,
max_silence_kept: int = 500):
self.db_threshold = db_threshold
self.min_samples = round(sr * min_length / 1000)
self.win_ln = round(sr * win_l / 1000)
self.win_sn = round(sr * win_s / 1000)
self.max_silence = round(sr * max_silence_kept / 1000)
if not self.min_samples >= self.win_ln >= self.win_sn:
raise ValueError('The following condition must be satisfied: min_length >= win_l >= win_s')
if not self.max_silence >= self.win_sn:
raise ValueError('The following condition must be satisfied: max_silence_kept >= win_s')
@timeit
def slice(self, audio):
if len(audio.shape) > 1:
samples = librosa.to_mono(audio)
else:
samples = audio
if samples.shape[0] <= self.min_samples:
return [audio]
# get absolute amplitudes
abs_amp = np.abs(samples - np.mean(samples))
# calculate local maximum with large window
win_max_db = level2db(_window_maximum(abs_amp, win_sz=self.win_ln))
sil_tags = []
left = right = 0
while right < win_max_db.shape[0]:
if win_max_db[right] < self.db_threshold:
right += 1
elif left == right:
left += 1
right += 1
else:
if left == 0:
split_loc_l = left
else:
sil_left_n = min(self.max_silence, (right + self.win_ln - left) // 2)
rms_db_left = level2db(_window_rms(samples[left: left + sil_left_n], win_sz=self.win_sn))
split_win_l = left + np.argmin(rms_db_left)
split_loc_l = split_win_l + np.argmin(abs_amp[split_win_l: split_win_l + self.win_sn])
if len(sil_tags) != 0 and split_loc_l - sil_tags[-1][1] < self.min_samples and right < win_max_db.shape[0] - 1:
right += 1
left = right
continue
if right == win_max_db.shape[0] - 1:
split_loc_r = right + self.win_ln
else:
sil_right_n = min(self.max_silence, (right + self.win_ln - left) // 2)
rms_db_right = level2db(_window_rms(samples[right + self.win_ln - sil_right_n: right + self.win_ln], win_sz=self.win_sn))
split_win_r = right + self.win_ln - sil_right_n + np.argmin(rms_db_right)
split_loc_r = split_win_r + np.argmin(abs_amp[split_win_r: split_win_r + self.win_sn])
sil_tags.append((split_loc_l, split_loc_r))
right += 1
left = right
if left != right:
sil_left_n = min(self.max_silence, (right + self.win_ln - left) // 2)
rms_db_left = level2db(_window_rms(samples[left: left + sil_left_n], win_sz=self.win_sn))
split_win_l = left + np.argmin(rms_db_left)
split_loc_l = split_win_l + np.argmin(abs_amp[split_win_l: split_win_l + self.win_sn])
sil_tags.append((split_loc_l, samples.shape[0]))
if len(sil_tags) == 0:
return [audio]
else:
chunks = []
if sil_tags[0][0] > 0:
chunks.append(_apply_slice(audio, 0, sil_tags[0][0]))
for i in range(0, len(sil_tags) - 1):
chunks.append(_apply_slice(audio, sil_tags[i][1], sil_tags[i + 1][0]))
if sil_tags[-1][1] < samples.shape[0] - 1:
chunks.append(_apply_slice(audio, sil_tags[-1][1], samples.shape[0]))
return chunks
def main():
parser = ArgumentParser()
parser.add_argument('audio', type=str, help='The audio to be sliced')
parser.add_argument('--out', type=str, help='Output directory of the sliced audio clips')
parser.add_argument('--db_thresh', type=float, required=False, default=-40, help='The dB threshold for silence detection')
parser.add_argument('--min_len', type=int, required=False, default=5000, help='The minimum milliseconds required for each sliced audio clip')
parser.add_argument('--win_l', type=int, required=False, default=300, help='Size of the large sliding window, presented in milliseconds')
parser.add_argument('--win_s', type=int, required=False, default=20, help='Size of the small sliding window, presented in milliseconds')
parser.add_argument('--max_sil_kept', type=int, required=False, default=500, help='The maximum silence length kept around the sliced audio, presented in milliseconds')
args = parser.parse_args()
out = args.out
if out is None:
out = os.path.dirname(os.path.abspath(args.audio))
audio, sr = librosa.load(args.audio, sr=None)
slicer = Slicer(
sr=sr,
db_threshold=args.db_thresh,
min_length=args.min_len,
win_l=args.win_l,
win_s=args.win_s,
max_silence_kept=args.max_sil_kept
)
chunks = slicer.slice(audio)
if not os.path.exists(args.out):
os.makedirs(args.out)
for i, chunk in enumerate(chunks):
soundfile.write(os.path.join(out, f'%s_%d.wav' % (os.path.basename(args.audio).rsplit('.', maxsplit=1)[0], i)), chunk, sr)
if __name__ == '__main__':
main()

View File

@ -0,0 +1,63 @@
import numpy as np,ffmpeg,os,traceback
from slicer import Slicer
slicer = Slicer(
sr=40000,
db_threshold=-32,
min_length=800,
win_l=400,
win_s=20,
max_silence_kept=150
)
def p0_load_audio(file, sr):#str-ing
try:
out, _ = (
ffmpeg.input(file, threads=0)
.output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr)
.run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
)
except ffmpeg.Error as e:
raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
def p1_trim_audio(slicer,audio):return slicer.slice(audio)
def p2_avg_cut(audio,sr,per=3.7,overlap=0.3,tail=4):
i = 0
audios=[]
while (1):
start = int(sr * (per - overlap) * i)
i += 1
if (len(audio[start:]) > tail * sr):
audios.append(audio[start:start + int(per * sr)])
else:
audios.append(audio[start:])
break
return audios
def p2b_get_vol(audio):return np.square(audio).mean()
def p3_norm(audio,alpha=0.8,maxx=0.95):return audio / np.abs(audio).max() * (maxx * alpha) + (1-alpha) * audio
def pipeline(inp_root,sr1=40000,sr2=16000,if_trim=True,if_avg_cut=True,if_norm=True,save_root1=None,save_root2=None):
if(save_root1==None and save_root2==None):return "No save root."
name2vol={}
infos=[]
names=[]
for name in os.listdir(inp_root):
try:
inp_path=os.path.join(inp_root,name)
audio=p0_load_audio(inp_path)
except:
infos.append("%s\t%s"%(name,traceback.format_exc()))
continue
if(if_trim==True):res1s=p1_trim_audio(audio)
else:res1s=[audio]
for i0,res1 in res1s:
if(if_avg_cut==True):res2=p2_avg_cut(res1)
else:res2=[res1]

Binary file not shown.

Binary file not shown.

170
uvr5_pack/lib_v5/dataset.py Normal file
View File

@ -0,0 +1,170 @@
import os
import random
import numpy as np
import torch
import torch.utils.data
from tqdm import tqdm
from uvr5_pack.lib_v5 import spec_utils
class VocalRemoverValidationSet(torch.utils.data.Dataset):
def __init__(self, patch_list):
self.patch_list = patch_list
def __len__(self):
return len(self.patch_list)
def __getitem__(self, idx):
path = self.patch_list[idx]
data = np.load(path)
X, y = data['X'], data['y']
X_mag = np.abs(X)
y_mag = np.abs(y)
return X_mag, y_mag
def make_pair(mix_dir, inst_dir):
input_exts = ['.wav', '.m4a', '.mp3', '.mp4', '.flac']
X_list = sorted([
os.path.join(mix_dir, fname)
for fname in os.listdir(mix_dir)
if os.path.splitext(fname)[1] in input_exts])
y_list = sorted([
os.path.join(inst_dir, fname)
for fname in os.listdir(inst_dir)
if os.path.splitext(fname)[1] in input_exts])
filelist = list(zip(X_list, y_list))
return filelist
def train_val_split(dataset_dir, split_mode, val_rate, val_filelist):
if split_mode == 'random':
filelist = make_pair(
os.path.join(dataset_dir, 'mixtures'),
os.path.join(dataset_dir, 'instruments'))
random.shuffle(filelist)
if len(val_filelist) == 0:
val_size = int(len(filelist) * val_rate)
train_filelist = filelist[:-val_size]
val_filelist = filelist[-val_size:]
else:
train_filelist = [
pair for pair in filelist
if list(pair) not in val_filelist]
elif split_mode == 'subdirs':
if len(val_filelist) != 0:
raise ValueError('The `val_filelist` option is not available in `subdirs` mode')
train_filelist = make_pair(
os.path.join(dataset_dir, 'training/mixtures'),
os.path.join(dataset_dir, 'training/instruments'))
val_filelist = make_pair(
os.path.join(dataset_dir, 'validation/mixtures'),
os.path.join(dataset_dir, 'validation/instruments'))
return train_filelist, val_filelist
def augment(X, y, reduction_rate, reduction_mask, mixup_rate, mixup_alpha):
perm = np.random.permutation(len(X))
for i, idx in enumerate(tqdm(perm)):
if np.random.uniform() < reduction_rate:
y[idx] = spec_utils.reduce_vocal_aggressively(X[idx], y[idx], reduction_mask)
if np.random.uniform() < 0.5:
# swap channel
X[idx] = X[idx, ::-1]
y[idx] = y[idx, ::-1]
if np.random.uniform() < 0.02:
# mono
X[idx] = X[idx].mean(axis=0, keepdims=True)
y[idx] = y[idx].mean(axis=0, keepdims=True)
if np.random.uniform() < 0.02:
# inst
X[idx] = y[idx]
if np.random.uniform() < mixup_rate and i < len(perm) - 1:
lam = np.random.beta(mixup_alpha, mixup_alpha)
X[idx] = lam * X[idx] + (1 - lam) * X[perm[i + 1]]
y[idx] = lam * y[idx] + (1 - lam) * y[perm[i + 1]]
return X, y
def make_padding(width, cropsize, offset):
left = offset
roi_size = cropsize - left * 2
if roi_size == 0:
roi_size = cropsize
right = roi_size - (width % roi_size) + left
return left, right, roi_size
def make_training_set(filelist, cropsize, patches, sr, hop_length, n_fft, offset):
len_dataset = patches * len(filelist)
X_dataset = np.zeros(
(len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64)
y_dataset = np.zeros(
(len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64)
for i, (X_path, y_path) in enumerate(tqdm(filelist)):
X, y = spec_utils.cache_or_load(X_path, y_path, sr, hop_length, n_fft)
coef = np.max([np.abs(X).max(), np.abs(y).max()])
X, y = X / coef, y / coef
l, r, roi_size = make_padding(X.shape[2], cropsize, offset)
X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode='constant')
y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode='constant')
starts = np.random.randint(0, X_pad.shape[2] - cropsize, patches)
ends = starts + cropsize
for j in range(patches):
idx = i * patches + j
X_dataset[idx] = X_pad[:, :, starts[j]:ends[j]]
y_dataset[idx] = y_pad[:, :, starts[j]:ends[j]]
return X_dataset, y_dataset
def make_validation_set(filelist, cropsize, sr, hop_length, n_fft, offset):
patch_list = []
patch_dir = 'cs{}_sr{}_hl{}_nf{}_of{}'.format(cropsize, sr, hop_length, n_fft, offset)
os.makedirs(patch_dir, exist_ok=True)
for i, (X_path, y_path) in enumerate(tqdm(filelist)):
basename = os.path.splitext(os.path.basename(X_path))[0]
X, y = spec_utils.cache_or_load(X_path, y_path, sr, hop_length, n_fft)
coef = np.max([np.abs(X).max(), np.abs(y).max()])
X, y = X / coef, y / coef
l, r, roi_size = make_padding(X.shape[2], cropsize, offset)
X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode='constant')
y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode='constant')
len_dataset = int(np.ceil(X.shape[2] / roi_size))
for j in range(len_dataset):
outpath = os.path.join(patch_dir, '{}_p{}.npz'.format(basename, j))
start = j * roi_size
if not os.path.exists(outpath):
np.savez(
outpath,
X=X_pad[:, :, start:start + cropsize],
y=y_pad[:, :, start:start + cropsize])
patch_list.append(outpath)
return VocalRemoverValidationSet(patch_list)

116
uvr5_pack/lib_v5/layers.py Normal file
View File

@ -0,0 +1,116 @@
import torch
from torch import nn
import torch.nn.functional as F
from uvr5_pack.lib_v5 import spec_utils
class Conv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(Conv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin, nout,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
bias=False),
nn.BatchNorm2d(nout),
activ()
)
def __call__(self, x):
return self.conv(x)
class SeperableConv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(SeperableConv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin, nin,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
groups=nin,
bias=False),
nn.Conv2d(
nin, nout,
kernel_size=1,
bias=False),
nn.BatchNorm2d(nout),
activ()
)
def __call__(self, x):
return self.conv(x)
class Encoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
super(Encoder, self).__init__()
self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
def __call__(self, x):
skip = self.conv1(x)
h = self.conv2(skip)
return h, skip
class Decoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
super(Decoder, self).__init__()
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None
def __call__(self, x, skip=None):
x = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True)
if skip is not None:
skip = spec_utils.crop_center(skip, x)
x = torch.cat([x, skip], dim=1)
h = self.conv(x)
if self.dropout is not None:
h = self.dropout(h)
return h
class ASPPModule(nn.Module):
def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
super(ASPPModule, self).__init__()
self.conv1 = nn.Sequential(
nn.AdaptiveAvgPool2d((1, None)),
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
)
self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
self.conv3 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[0], dilations[0], activ=activ)
self.conv4 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ)
self.conv5 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
self.bottleneck = nn.Sequential(
Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ),
nn.Dropout2d(0.1)
)
def forward(self, x):
_, _, h, w = x.size()
feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True)
feat2 = self.conv2(x)
feat3 = self.conv3(x)
feat4 = self.conv4(x)
feat5 = self.conv5(x)
out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1)
bottle = self.bottleneck(out)
return bottle

View File

@ -0,0 +1,116 @@
import torch
from torch import nn
import torch.nn.functional as F
from uvr5_pack.lib_v5 import spec_utils
class Conv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(Conv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin, nout,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
bias=False),
nn.BatchNorm2d(nout),
activ()
)
def __call__(self, x):
return self.conv(x)
class SeperableConv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(SeperableConv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin, nin,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
groups=nin,
bias=False),
nn.Conv2d(
nin, nout,
kernel_size=1,
bias=False),
nn.BatchNorm2d(nout),
activ()
)
def __call__(self, x):
return self.conv(x)
class Encoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
super(Encoder, self).__init__()
self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
def __call__(self, x):
skip = self.conv1(x)
h = self.conv2(skip)
return h, skip
class Decoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
super(Decoder, self).__init__()
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None
def __call__(self, x, skip=None):
x = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True)
if skip is not None:
skip = spec_utils.crop_center(skip, x)
x = torch.cat([x, skip], dim=1)
h = self.conv(x)
if self.dropout is not None:
h = self.dropout(h)
return h
class ASPPModule(nn.Module):
def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
super(ASPPModule, self).__init__()
self.conv1 = nn.Sequential(
nn.AdaptiveAvgPool2d((1, None)),
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
)
self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
self.conv3 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[0], dilations[0], activ=activ)
self.conv4 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ)
self.conv5 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
self.bottleneck = nn.Sequential(
Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ),
nn.Dropout2d(0.1)
)
def forward(self, x):
_, _, h, w = x.size()
feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True)
feat2 = self.conv2(x)
feat3 = self.conv3(x)
feat4 = self.conv4(x)
feat5 = self.conv5(x)
out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1)
bottle = self.bottleneck(out)
return bottle

View File

@ -0,0 +1,116 @@
import torch
from torch import nn
import torch.nn.functional as F
from uvr5_pack.lib_v5 import spec_utils
class Conv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(Conv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin, nout,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
bias=False),
nn.BatchNorm2d(nout),
activ()
)
def __call__(self, x):
return self.conv(x)
class SeperableConv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(SeperableConv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin, nin,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
groups=nin,
bias=False),
nn.Conv2d(
nin, nout,
kernel_size=1,
bias=False),
nn.BatchNorm2d(nout),
activ()
)
def __call__(self, x):
return self.conv(x)
class Encoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
super(Encoder, self).__init__()
self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
def __call__(self, x):
skip = self.conv1(x)
h = self.conv2(skip)
return h, skip
class Decoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
super(Decoder, self).__init__()
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None
def __call__(self, x, skip=None):
x = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True)
if skip is not None:
skip = spec_utils.crop_center(skip, x)
x = torch.cat([x, skip], dim=1)
h = self.conv(x)
if self.dropout is not None:
h = self.dropout(h)
return h
class ASPPModule(nn.Module):
def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
super(ASPPModule, self).__init__()
self.conv1 = nn.Sequential(
nn.AdaptiveAvgPool2d((1, None)),
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
)
self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
self.conv3 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[0], dilations[0], activ=activ)
self.conv4 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ)
self.conv5 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
self.bottleneck = nn.Sequential(
Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ),
nn.Dropout2d(0.1)
)
def forward(self, x):
_, _, h, w = x.size()
feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True)
feat2 = self.conv2(x)
feat3 = self.conv3(x)
feat4 = self.conv4(x)
feat5 = self.conv5(x)
out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1)
bottle = self.bottleneck(out)
return bottle

View File

@ -0,0 +1,122 @@
import torch
from torch import nn
import torch.nn.functional as F
from uvr5_pack.lib_v5 import spec_utils
class Conv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(Conv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin, nout,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
bias=False),
nn.BatchNorm2d(nout),
activ()
)
def __call__(self, x):
return self.conv(x)
class SeperableConv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(SeperableConv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin, nin,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
groups=nin,
bias=False),
nn.Conv2d(
nin, nout,
kernel_size=1,
bias=False),
nn.BatchNorm2d(nout),
activ()
)
def __call__(self, x):
return self.conv(x)
class Encoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
super(Encoder, self).__init__()
self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
def __call__(self, x):
skip = self.conv1(x)
h = self.conv2(skip)
return h, skip
class Decoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
super(Decoder, self).__init__()
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None
def __call__(self, x, skip=None):
x = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True)
if skip is not None:
skip = spec_utils.crop_center(skip, x)
x = torch.cat([x, skip], dim=1)
h = self.conv(x)
if self.dropout is not None:
h = self.dropout(h)
return h
class ASPPModule(nn.Module):
def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
super(ASPPModule, self).__init__()
self.conv1 = nn.Sequential(
nn.AdaptiveAvgPool2d((1, None)),
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
)
self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
self.conv3 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[0], dilations[0], activ=activ)
self.conv4 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ)
self.conv5 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
self.conv6 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
self.conv7 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
self.bottleneck = nn.Sequential(
Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ),
nn.Dropout2d(0.1)
)
def forward(self, x):
_, _, h, w = x.size()
feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True)
feat2 = self.conv2(x)
feat3 = self.conv3(x)
feat4 = self.conv4(x)
feat5 = self.conv5(x)
feat6 = self.conv6(x)
feat7 = self.conv7(x)
out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1)
bottle = self.bottleneck(out)
return bottle

View File

@ -0,0 +1,122 @@
import torch
from torch import nn
import torch.nn.functional as F
from uvr5_pack.lib_v5 import spec_utils
class Conv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(Conv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin, nout,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
bias=False),
nn.BatchNorm2d(nout),
activ()
)
def __call__(self, x):
return self.conv(x)
class SeperableConv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(SeperableConv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin, nin,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
groups=nin,
bias=False),
nn.Conv2d(
nin, nout,
kernel_size=1,
bias=False),
nn.BatchNorm2d(nout),
activ()
)
def __call__(self, x):
return self.conv(x)
class Encoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
super(Encoder, self).__init__()
self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
def __call__(self, x):
skip = self.conv1(x)
h = self.conv2(skip)
return h, skip
class Decoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
super(Decoder, self).__init__()
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None
def __call__(self, x, skip=None):
x = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True)
if skip is not None:
skip = spec_utils.crop_center(skip, x)
x = torch.cat([x, skip], dim=1)
h = self.conv(x)
if self.dropout is not None:
h = self.dropout(h)
return h
class ASPPModule(nn.Module):
def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
super(ASPPModule, self).__init__()
self.conv1 = nn.Sequential(
nn.AdaptiveAvgPool2d((1, None)),
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
)
self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
self.conv3 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[0], dilations[0], activ=activ)
self.conv4 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ)
self.conv5 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
self.conv6 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
self.conv7 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
self.bottleneck = nn.Sequential(
Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ),
nn.Dropout2d(0.1)
)
def forward(self, x):
_, _, h, w = x.size()
feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True)
feat2 = self.conv2(x)
feat3 = self.conv3(x)
feat4 = self.conv4(x)
feat5 = self.conv5(x)
feat6 = self.conv6(x)
feat7 = self.conv7(x)
out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1)
bottle = self.bottleneck(out)
return bottle

View File

@ -0,0 +1,122 @@
import torch
from torch import nn
import torch.nn.functional as F
from uvr5_pack.lib_v5 import spec_utils
class Conv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(Conv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin, nout,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
bias=False),
nn.BatchNorm2d(nout),
activ()
)
def __call__(self, x):
return self.conv(x)
class SeperableConv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(SeperableConv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin, nin,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
groups=nin,
bias=False),
nn.Conv2d(
nin, nout,
kernel_size=1,
bias=False),
nn.BatchNorm2d(nout),
activ()
)
def __call__(self, x):
return self.conv(x)
class Encoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
super(Encoder, self).__init__()
self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
def __call__(self, x):
skip = self.conv1(x)
h = self.conv2(skip)
return h, skip
class Decoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
super(Decoder, self).__init__()
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None
def __call__(self, x, skip=None):
x = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True)
if skip is not None:
skip = spec_utils.crop_center(skip, x)
x = torch.cat([x, skip], dim=1)
h = self.conv(x)
if self.dropout is not None:
h = self.dropout(h)
return h
class ASPPModule(nn.Module):
def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
super(ASPPModule, self).__init__()
self.conv1 = nn.Sequential(
nn.AdaptiveAvgPool2d((1, None)),
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
)
self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
self.conv3 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[0], dilations[0], activ=activ)
self.conv4 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ)
self.conv5 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
self.conv6 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
self.conv7 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
self.bottleneck = nn.Sequential(
Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ),
nn.Dropout2d(0.1)
)
def forward(self, x):
_, _, h, w = x.size()
feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True)
feat2 = self.conv2(x)
feat3 = self.conv3(x)
feat4 = self.conv4(x)
feat5 = self.conv5(x)
feat6 = self.conv6(x)
feat7 = self.conv7(x)
out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1)
bottle = self.bottleneck(out)
return bottle

View File

@ -0,0 +1,60 @@
import json
import os
import pathlib
default_param = {}
default_param['bins'] = 768
default_param['unstable_bins'] = 9 # training only
default_param['reduction_bins'] = 762 # training only
default_param['sr'] = 44100
default_param['pre_filter_start'] = 757
default_param['pre_filter_stop'] = 768
default_param['band'] = {}
default_param['band'][1] = {
'sr': 11025,
'hl': 128,
'n_fft': 960,
'crop_start': 0,
'crop_stop': 245,
'lpf_start': 61, # inference only
'res_type': 'polyphase'
}
default_param['band'][2] = {
'sr': 44100,
'hl': 512,
'n_fft': 1536,
'crop_start': 24,
'crop_stop': 547,
'hpf_start': 81, # inference only
'res_type': 'sinc_best'
}
def int_keys(d):
r = {}
for k, v in d:
if k.isdigit():
k = int(k)
r[k] = v
return r
class ModelParameters(object):
def __init__(self, config_path=''):
if '.pth' == pathlib.Path(config_path).suffix:
import zipfile
with zipfile.ZipFile(config_path, 'r') as zip:
self.param = json.loads(zip.read('param.json'), object_pairs_hook=int_keys)
elif '.json' == pathlib.Path(config_path).suffix:
with open(config_path, 'r') as f:
self.param = json.loads(f.read(), object_pairs_hook=int_keys)
else:
self.param = default_param
for k in ['mid_side', 'mid_side_b', 'mid_side_b2', 'stereo_w', 'stereo_n', 'reverse']:
if not k in self.param:
self.param[k] = False

View File

@ -0,0 +1,19 @@
{
"bins": 1024,
"unstable_bins": 0,
"reduction_bins": 0,
"band": {
"1": {
"sr": 16000,
"hl": 512,
"n_fft": 2048,
"crop_start": 0,
"crop_stop": 1024,
"hpf_start": -1,
"res_type": "sinc_best"
}
},
"sr": 16000,
"pre_filter_start": 1023,
"pre_filter_stop": 1024
}

View File

@ -0,0 +1,19 @@
{
"bins": 1024,
"unstable_bins": 0,
"reduction_bins": 0,
"band": {
"1": {
"sr": 32000,
"hl": 512,
"n_fft": 2048,
"crop_start": 0,
"crop_stop": 1024,
"hpf_start": -1,
"res_type": "kaiser_fast"
}
},
"sr": 32000,
"pre_filter_start": 1000,
"pre_filter_stop": 1021
}

View File

@ -0,0 +1,19 @@
{
"bins": 1024,
"unstable_bins": 0,
"reduction_bins": 0,
"band": {
"1": {
"sr": 33075,
"hl": 384,
"n_fft": 2048,
"crop_start": 0,
"crop_stop": 1024,
"hpf_start": -1,
"res_type": "sinc_best"
}
},
"sr": 33075,
"pre_filter_start": 1000,
"pre_filter_stop": 1021
}

View File

@ -0,0 +1,19 @@
{
"bins": 1024,
"unstable_bins": 0,
"reduction_bins": 0,
"band": {
"1": {
"sr": 44100,
"hl": 1024,
"n_fft": 2048,
"crop_start": 0,
"crop_stop": 1024,
"hpf_start": -1,
"res_type": "sinc_best"
}
},
"sr": 44100,
"pre_filter_start": 1023,
"pre_filter_stop": 1024
}

View File

@ -0,0 +1,19 @@
{
"bins": 256,
"unstable_bins": 0,
"reduction_bins": 0,
"band": {
"1": {
"sr": 44100,
"hl": 256,
"n_fft": 512,
"crop_start": 0,
"crop_stop": 256,
"hpf_start": -1,
"res_type": "sinc_best"
}
},
"sr": 44100,
"pre_filter_start": 256,
"pre_filter_stop": 256
}

View File

@ -0,0 +1,19 @@
{
"bins": 1024,
"unstable_bins": 0,
"reduction_bins": 0,
"band": {
"1": {
"sr": 44100,
"hl": 512,
"n_fft": 2048,
"crop_start": 0,
"crop_stop": 1024,
"hpf_start": -1,
"res_type": "sinc_best"
}
},
"sr": 44100,
"pre_filter_start": 1023,
"pre_filter_stop": 1024
}

View File

@ -0,0 +1,19 @@
{
"bins": 1024,
"unstable_bins": 0,
"reduction_bins": 0,
"band": {
"1": {
"sr": 44100,
"hl": 512,
"n_fft": 2048,
"crop_start": 0,
"crop_stop": 700,
"hpf_start": -1,
"res_type": "sinc_best"
}
},
"sr": 44100,
"pre_filter_start": 1023,
"pre_filter_stop": 700
}

View File

@ -0,0 +1,30 @@
{
"bins": 768,
"unstable_bins": 7,
"reduction_bins": 705,
"band": {
"1": {
"sr": 6000,
"hl": 66,
"n_fft": 512,
"crop_start": 0,
"crop_stop": 240,
"lpf_start": 60,
"lpf_stop": 118,
"res_type": "sinc_fastest"
},
"2": {
"sr": 32000,
"hl": 352,
"n_fft": 1024,
"crop_start": 22,
"crop_stop": 505,
"hpf_start": 44,
"hpf_stop": 23,
"res_type": "sinc_medium"
}
},
"sr": 32000,
"pre_filter_start": 710,
"pre_filter_stop": 731
}

View File

@ -0,0 +1,30 @@
{
"bins": 512,
"unstable_bins": 7,
"reduction_bins": 510,
"band": {
"1": {
"sr": 11025,
"hl": 160,
"n_fft": 768,
"crop_start": 0,
"crop_stop": 192,
"lpf_start": 41,
"lpf_stop": 139,
"res_type": "sinc_fastest"
},
"2": {
"sr": 44100,
"hl": 640,
"n_fft": 1024,
"crop_start": 10,
"crop_stop": 320,
"hpf_start": 47,
"hpf_stop": 15,
"res_type": "sinc_medium"
}
},
"sr": 44100,
"pre_filter_start": 510,
"pre_filter_stop": 512
}

View File

@ -0,0 +1,30 @@
{
"bins": 768,
"unstable_bins": 7,
"reduction_bins": 705,
"band": {
"1": {
"sr": 6000,
"hl": 66,
"n_fft": 512,
"crop_start": 0,
"crop_stop": 240,
"lpf_start": 60,
"lpf_stop": 240,
"res_type": "sinc_fastest"
},
"2": {
"sr": 48000,
"hl": 528,
"n_fft": 1536,
"crop_start": 22,
"crop_stop": 505,
"hpf_start": 82,
"hpf_stop": 22,
"res_type": "sinc_medium"
}
},
"sr": 48000,
"pre_filter_start": 710,
"pre_filter_stop": 731
}

View File

@ -0,0 +1,42 @@
{
"bins": 768,
"unstable_bins": 5,
"reduction_bins": 733,
"band": {
"1": {
"sr": 11025,
"hl": 128,
"n_fft": 768,
"crop_start": 0,
"crop_stop": 278,
"lpf_start": 28,
"lpf_stop": 140,
"res_type": "polyphase"
},
"2": {
"sr": 22050,
"hl": 256,
"n_fft": 768,
"crop_start": 14,
"crop_stop": 322,
"hpf_start": 70,
"hpf_stop": 14,
"lpf_start": 283,
"lpf_stop": 314,
"res_type": "polyphase"
},
"3": {
"sr": 44100,
"hl": 512,
"n_fft": 768,
"crop_start": 131,
"crop_stop": 313,
"hpf_start": 154,
"hpf_stop": 141,
"res_type": "sinc_medium"
}
},
"sr": 44100,
"pre_filter_start": 757,
"pre_filter_stop": 768
}

View File

@ -0,0 +1,43 @@
{
"mid_side": true,
"bins": 768,
"unstable_bins": 5,
"reduction_bins": 733,
"band": {
"1": {
"sr": 11025,
"hl": 128,
"n_fft": 768,
"crop_start": 0,
"crop_stop": 278,
"lpf_start": 28,
"lpf_stop": 140,
"res_type": "polyphase"
},
"2": {
"sr": 22050,
"hl": 256,
"n_fft": 768,
"crop_start": 14,
"crop_stop": 322,
"hpf_start": 70,
"hpf_stop": 14,
"lpf_start": 283,
"lpf_stop": 314,
"res_type": "polyphase"
},
"3": {
"sr": 44100,
"hl": 512,
"n_fft": 768,
"crop_start": 131,
"crop_stop": 313,
"hpf_start": 154,
"hpf_stop": 141,
"res_type": "sinc_medium"
}
},
"sr": 44100,
"pre_filter_start": 757,
"pre_filter_stop": 768
}

View File

@ -0,0 +1,43 @@
{
"mid_side_b2": true,
"bins": 640,
"unstable_bins": 7,
"reduction_bins": 565,
"band": {
"1": {
"sr": 11025,
"hl": 108,
"n_fft": 1024,
"crop_start": 0,
"crop_stop": 187,
"lpf_start": 92,
"lpf_stop": 186,
"res_type": "polyphase"
},
"2": {
"sr": 22050,
"hl": 216,
"n_fft": 768,
"crop_start": 0,
"crop_stop": 212,
"hpf_start": 68,
"hpf_stop": 34,
"lpf_start": 174,
"lpf_stop": 209,
"res_type": "polyphase"
},
"3": {
"sr": 44100,
"hl": 432,
"n_fft": 640,
"crop_start": 66,
"crop_stop": 307,
"hpf_start": 86,
"hpf_stop": 72,
"res_type": "kaiser_fast"
}
},
"sr": 44100,
"pre_filter_start": 639,
"pre_filter_stop": 640
}

View File

@ -0,0 +1,54 @@
{
"bins": 768,
"unstable_bins": 7,
"reduction_bins": 668,
"band": {
"1": {
"sr": 11025,
"hl": 128,
"n_fft": 1024,
"crop_start": 0,
"crop_stop": 186,
"lpf_start": 37,
"lpf_stop": 73,
"res_type": "polyphase"
},
"2": {
"sr": 11025,
"hl": 128,
"n_fft": 512,
"crop_start": 4,
"crop_stop": 185,
"hpf_start": 36,
"hpf_stop": 18,
"lpf_start": 93,
"lpf_stop": 185,
"res_type": "polyphase"
},
"3": {
"sr": 22050,
"hl": 256,
"n_fft": 512,
"crop_start": 46,
"crop_stop": 186,
"hpf_start": 93,
"hpf_stop": 46,
"lpf_start": 164,
"lpf_stop": 186,
"res_type": "polyphase"
},
"4": {
"sr": 44100,
"hl": 512,
"n_fft": 768,
"crop_start": 121,
"crop_stop": 382,
"hpf_start": 138,
"hpf_stop": 123,
"res_type": "sinc_medium"
}
},
"sr": 44100,
"pre_filter_start": 740,
"pre_filter_stop": 768
}

View File

@ -0,0 +1,55 @@
{
"bins": 768,
"unstable_bins": 7,
"mid_side": true,
"reduction_bins": 668,
"band": {
"1": {
"sr": 11025,
"hl": 128,
"n_fft": 1024,
"crop_start": 0,
"crop_stop": 186,
"lpf_start": 37,
"lpf_stop": 73,
"res_type": "polyphase"
},
"2": {
"sr": 11025,
"hl": 128,
"n_fft": 512,
"crop_start": 4,
"crop_stop": 185,
"hpf_start": 36,
"hpf_stop": 18,
"lpf_start": 93,
"lpf_stop": 185,
"res_type": "polyphase"
},
"3": {
"sr": 22050,
"hl": 256,
"n_fft": 512,
"crop_start": 46,
"crop_stop": 186,
"hpf_start": 93,
"hpf_stop": 46,
"lpf_start": 164,
"lpf_stop": 186,
"res_type": "polyphase"
},
"4": {
"sr": 44100,
"hl": 512,
"n_fft": 768,
"crop_start": 121,
"crop_stop": 382,
"hpf_start": 138,
"hpf_stop": 123,
"res_type": "sinc_medium"
}
},
"sr": 44100,
"pre_filter_start": 740,
"pre_filter_stop": 768
}

View File

@ -0,0 +1,55 @@
{
"mid_side_b": true,
"bins": 768,
"unstable_bins": 7,
"reduction_bins": 668,
"band": {
"1": {
"sr": 11025,
"hl": 128,
"n_fft": 1024,
"crop_start": 0,
"crop_stop": 186,
"lpf_start": 37,
"lpf_stop": 73,
"res_type": "polyphase"
},
"2": {
"sr": 11025,
"hl": 128,
"n_fft": 512,
"crop_start": 4,
"crop_stop": 185,
"hpf_start": 36,
"hpf_stop": 18,
"lpf_start": 93,
"lpf_stop": 185,
"res_type": "polyphase"
},
"3": {
"sr": 22050,
"hl": 256,
"n_fft": 512,
"crop_start": 46,
"crop_stop": 186,
"hpf_start": 93,
"hpf_stop": 46,
"lpf_start": 164,
"lpf_stop": 186,
"res_type": "polyphase"
},
"4": {
"sr": 44100,
"hl": 512,
"n_fft": 768,
"crop_start": 121,
"crop_stop": 382,
"hpf_start": 138,
"hpf_stop": 123,
"res_type": "sinc_medium"
}
},
"sr": 44100,
"pre_filter_start": 740,
"pre_filter_stop": 768
}

View File

@ -0,0 +1,55 @@
{
"mid_side_b": true,
"bins": 768,
"unstable_bins": 7,
"reduction_bins": 668,
"band": {
"1": {
"sr": 11025,
"hl": 128,
"n_fft": 1024,
"crop_start": 0,
"crop_stop": 186,
"lpf_start": 37,
"lpf_stop": 73,
"res_type": "polyphase"
},
"2": {
"sr": 11025,
"hl": 128,
"n_fft": 512,
"crop_start": 4,
"crop_stop": 185,
"hpf_start": 36,
"hpf_stop": 18,
"lpf_start": 93,
"lpf_stop": 185,
"res_type": "polyphase"
},
"3": {
"sr": 22050,
"hl": 256,
"n_fft": 512,
"crop_start": 46,
"crop_stop": 186,
"hpf_start": 93,
"hpf_stop": 46,
"lpf_start": 164,
"lpf_stop": 186,
"res_type": "polyphase"
},
"4": {
"sr": 44100,
"hl": 512,
"n_fft": 768,
"crop_start": 121,
"crop_stop": 382,
"hpf_start": 138,
"hpf_stop": 123,
"res_type": "sinc_medium"
}
},
"sr": 44100,
"pre_filter_start": 740,
"pre_filter_stop": 768
}

View File

@ -0,0 +1,55 @@
{
"reverse": true,
"bins": 768,
"unstable_bins": 7,
"reduction_bins": 668,
"band": {
"1": {
"sr": 11025,
"hl": 128,
"n_fft": 1024,
"crop_start": 0,
"crop_stop": 186,
"lpf_start": 37,
"lpf_stop": 73,
"res_type": "polyphase"
},
"2": {
"sr": 11025,
"hl": 128,
"n_fft": 512,
"crop_start": 4,
"crop_stop": 185,
"hpf_start": 36,
"hpf_stop": 18,
"lpf_start": 93,
"lpf_stop": 185,
"res_type": "polyphase"
},
"3": {
"sr": 22050,
"hl": 256,
"n_fft": 512,
"crop_start": 46,
"crop_stop": 186,
"hpf_start": 93,
"hpf_stop": 46,
"lpf_start": 164,
"lpf_stop": 186,
"res_type": "polyphase"
},
"4": {
"sr": 44100,
"hl": 512,
"n_fft": 768,
"crop_start": 121,
"crop_stop": 382,
"hpf_start": 138,
"hpf_stop": 123,
"res_type": "sinc_medium"
}
},
"sr": 44100,
"pre_filter_start": 740,
"pre_filter_stop": 768
}

View File

@ -0,0 +1,55 @@
{
"stereo_w": true,
"bins": 768,
"unstable_bins": 7,
"reduction_bins": 668,
"band": {
"1": {
"sr": 11025,
"hl": 128,
"n_fft": 1024,
"crop_start": 0,
"crop_stop": 186,
"lpf_start": 37,
"lpf_stop": 73,
"res_type": "polyphase"
},
"2": {
"sr": 11025,
"hl": 128,
"n_fft": 512,
"crop_start": 4,
"crop_stop": 185,
"hpf_start": 36,
"hpf_stop": 18,
"lpf_start": 93,
"lpf_stop": 185,
"res_type": "polyphase"
},
"3": {
"sr": 22050,
"hl": 256,
"n_fft": 512,
"crop_start": 46,
"crop_stop": 186,
"hpf_start": 93,
"hpf_stop": 46,
"lpf_start": 164,
"lpf_stop": 186,
"res_type": "polyphase"
},
"4": {
"sr": 44100,
"hl": 512,
"n_fft": 768,
"crop_start": 121,
"crop_stop": 382,
"hpf_start": 138,
"hpf_stop": 123,
"res_type": "sinc_medium"
}
},
"sr": 44100,
"pre_filter_start": 740,
"pre_filter_stop": 768
}

View File

@ -0,0 +1,54 @@
{
"bins": 672,
"unstable_bins": 8,
"reduction_bins": 637,
"band": {
"1": {
"sr": 7350,
"hl": 80,
"n_fft": 640,
"crop_start": 0,
"crop_stop": 85,
"lpf_start": 25,
"lpf_stop": 53,
"res_type": "polyphase"
},
"2": {
"sr": 7350,
"hl": 80,
"n_fft": 320,
"crop_start": 4,
"crop_stop": 87,
"hpf_start": 25,
"hpf_stop": 12,
"lpf_start": 31,
"lpf_stop": 62,
"res_type": "polyphase"
},
"3": {
"sr": 14700,
"hl": 160,
"n_fft": 512,
"crop_start": 17,
"crop_stop": 216,
"hpf_start": 48,
"hpf_stop": 24,
"lpf_start": 139,
"lpf_stop": 210,
"res_type": "polyphase"
},
"4": {
"sr": 44100,
"hl": 480,
"n_fft": 960,
"crop_start": 78,
"crop_stop": 383,
"hpf_start": 130,
"hpf_stop": 86,
"res_type": "kaiser_fast"
}
},
"sr": 44100,
"pre_filter_start": 668,
"pre_filter_stop": 672
}

View File

@ -0,0 +1,55 @@
{
"bins": 672,
"unstable_bins": 8,
"reduction_bins": 637,
"band": {
"1": {
"sr": 7350,
"hl": 80,
"n_fft": 640,
"crop_start": 0,
"crop_stop": 85,
"lpf_start": 25,
"lpf_stop": 53,
"res_type": "polyphase"
},
"2": {
"sr": 7350,
"hl": 80,
"n_fft": 320,
"crop_start": 4,
"crop_stop": 87,
"hpf_start": 25,
"hpf_stop": 12,
"lpf_start": 31,
"lpf_stop": 62,
"res_type": "polyphase"
},
"3": {
"sr": 14700,
"hl": 160,
"n_fft": 512,
"crop_start": 17,
"crop_stop": 216,
"hpf_start": 48,
"hpf_stop": 24,
"lpf_start": 139,
"lpf_stop": 210,
"res_type": "polyphase"
},
"4": {
"sr": 44100,
"hl": 480,
"n_fft": 960,
"crop_start": 78,
"crop_stop": 383,
"hpf_start": 130,
"hpf_stop": 86,
"convert_channels": "stereo_n",
"res_type": "kaiser_fast"
}
},
"sr": 44100,
"pre_filter_start": 668,
"pre_filter_stop": 672
}

View File

@ -0,0 +1,43 @@
{
"mid_side_b2": true,
"bins": 1280,
"unstable_bins": 7,
"reduction_bins": 565,
"band": {
"1": {
"sr": 11025,
"hl": 108,
"n_fft": 2048,
"crop_start": 0,
"crop_stop": 374,
"lpf_start": 92,
"lpf_stop": 186,
"res_type": "polyphase"
},
"2": {
"sr": 22050,
"hl": 216,
"n_fft": 1536,
"crop_start": 0,
"crop_stop": 424,
"hpf_start": 68,
"hpf_stop": 34,
"lpf_start": 348,
"lpf_stop": 418,
"res_type": "polyphase"
},
"3": {
"sr": 44100,
"hl": 432,
"n_fft": 1280,
"crop_start": 132,
"crop_stop": 614,
"hpf_start": 172,
"hpf_stop": 144,
"res_type": "polyphase"
}
},
"sr": 44100,
"pre_filter_start": 1280,
"pre_filter_stop": 1280
}

113
uvr5_pack/lib_v5/nets.py Normal file
View File

@ -0,0 +1,113 @@
import torch
from torch import nn
import torch.nn.functional as F
from uvr5_pack.lib_v5 import layers
from uvr5_pack.lib_v5 import spec_utils
class BaseASPPNet(nn.Module):
def __init__(self, nin, ch, dilations=(4, 8, 16)):
super(BaseASPPNet, self).__init__()
self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)
self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)
self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)
def __call__(self, x):
h, e1 = self.enc1(x)
h, e2 = self.enc2(h)
h, e3 = self.enc3(h)
h, e4 = self.enc4(h)
h = self.aspp(h)
h = self.dec4(h, e4)
h = self.dec3(h, e3)
h = self.dec2(h, e2)
h = self.dec1(h, e1)
return h
class CascadedASPPNet(nn.Module):
def __init__(self, n_fft):
super(CascadedASPPNet, self).__init__()
self.stg1_low_band_net = BaseASPPNet(2, 16)
self.stg1_high_band_net = BaseASPPNet(2, 16)
self.stg2_bridge = layers.Conv2DBNActiv(18, 8, 1, 1, 0)
self.stg2_full_band_net = BaseASPPNet(8, 16)
self.stg3_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
self.stg3_full_band_net = BaseASPPNet(16, 32)
self.out = nn.Conv2d(32, 2, 1, bias=False)
self.aux1_out = nn.Conv2d(16, 2, 1, bias=False)
self.aux2_out = nn.Conv2d(16, 2, 1, bias=False)
self.max_bin = n_fft // 2
self.output_bin = n_fft // 2 + 1
self.offset = 128
def forward(self, x, aggressiveness=None):
mix = x.detach()
x = x.clone()
x = x[:, :, :self.max_bin]
bandw = x.size()[2] // 2
aux1 = torch.cat([
self.stg1_low_band_net(x[:, :, :bandw]),
self.stg1_high_band_net(x[:, :, bandw:])
], dim=2)
h = torch.cat([x, aux1], dim=1)
aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
h = torch.cat([x, aux1, aux2], dim=1)
h = self.stg3_full_band_net(self.stg3_bridge(h))
mask = torch.sigmoid(self.out(h))
mask = F.pad(
input=mask,
pad=(0, 0, 0, self.output_bin - mask.size()[2]),
mode='replicate')
if self.training:
aux1 = torch.sigmoid(self.aux1_out(aux1))
aux1 = F.pad(
input=aux1,
pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
mode='replicate')
aux2 = torch.sigmoid(self.aux2_out(aux2))
aux2 = F.pad(
input=aux2,
pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
mode='replicate')
return mask * mix, aux1 * mix, aux2 * mix
else:
if aggressiveness:
mask[:, :, :aggressiveness['split_bin']] = torch.pow(mask[:, :, :aggressiveness['split_bin']], 1 + aggressiveness['value'] / 3)
mask[:, :, aggressiveness['split_bin']:] = torch.pow(mask[:, :, aggressiveness['split_bin']:], 1 + aggressiveness['value'])
return mask * mix
def predict(self, x_mag, aggressiveness=None):
h = self.forward(x_mag, aggressiveness)
if self.offset > 0:
h = h[:, :, :, self.offset:-self.offset]
assert h.size()[3] > 0
return h

View File

@ -0,0 +1,112 @@
import torch
from torch import nn
import torch.nn.functional as F
from uvr5_pack.lib_v5 import layers_123821KB as layers
class BaseASPPNet(nn.Module):
def __init__(self, nin, ch, dilations=(4, 8, 16)):
super(BaseASPPNet, self).__init__()
self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)
self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)
self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)
def __call__(self, x):
h, e1 = self.enc1(x)
h, e2 = self.enc2(h)
h, e3 = self.enc3(h)
h, e4 = self.enc4(h)
h = self.aspp(h)
h = self.dec4(h, e4)
h = self.dec3(h, e3)
h = self.dec2(h, e2)
h = self.dec1(h, e1)
return h
class CascadedASPPNet(nn.Module):
def __init__(self, n_fft):
super(CascadedASPPNet, self).__init__()
self.stg1_low_band_net = BaseASPPNet(2, 32)
self.stg1_high_band_net = BaseASPPNet(2, 32)
self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
self.stg2_full_band_net = BaseASPPNet(16, 32)
self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
self.stg3_full_band_net = BaseASPPNet(32, 64)
self.out = nn.Conv2d(64, 2, 1, bias=False)
self.aux1_out = nn.Conv2d(32, 2, 1, bias=False)
self.aux2_out = nn.Conv2d(32, 2, 1, bias=False)
self.max_bin = n_fft // 2
self.output_bin = n_fft // 2 + 1
self.offset = 128
def forward(self, x, aggressiveness=None):
mix = x.detach()
x = x.clone()
x = x[:, :, :self.max_bin]
bandw = x.size()[2] // 2
aux1 = torch.cat([
self.stg1_low_band_net(x[:, :, :bandw]),
self.stg1_high_band_net(x[:, :, bandw:])
], dim=2)
h = torch.cat([x, aux1], dim=1)
aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
h = torch.cat([x, aux1, aux2], dim=1)
h = self.stg3_full_band_net(self.stg3_bridge(h))
mask = torch.sigmoid(self.out(h))
mask = F.pad(
input=mask,
pad=(0, 0, 0, self.output_bin - mask.size()[2]),
mode='replicate')
if self.training:
aux1 = torch.sigmoid(self.aux1_out(aux1))
aux1 = F.pad(
input=aux1,
pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
mode='replicate')
aux2 = torch.sigmoid(self.aux2_out(aux2))
aux2 = F.pad(
input=aux2,
pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
mode='replicate')
return mask * mix, aux1 * mix, aux2 * mix
else:
if aggressiveness:
mask[:, :, :aggressiveness['split_bin']] = torch.pow(mask[:, :, :aggressiveness['split_bin']], 1 + aggressiveness['value'] / 3)
mask[:, :, aggressiveness['split_bin']:] = torch.pow(mask[:, :, aggressiveness['split_bin']:], 1 + aggressiveness['value'])
return mask * mix
def predict(self, x_mag, aggressiveness=None):
h = self.forward(x_mag, aggressiveness)
if self.offset > 0:
h = h[:, :, :, self.offset:-self.offset]
assert h.size()[3] > 0
return h

View File

@ -0,0 +1,112 @@
import torch
from torch import nn
import torch.nn.functional as F
from uvr5_pack.lib_v5 import layers_123821KB as layers
class BaseASPPNet(nn.Module):
def __init__(self, nin, ch, dilations=(4, 8, 16)):
super(BaseASPPNet, self).__init__()
self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)
self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)
self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)
def __call__(self, x):
h, e1 = self.enc1(x)
h, e2 = self.enc2(h)
h, e3 = self.enc3(h)
h, e4 = self.enc4(h)
h = self.aspp(h)
h = self.dec4(h, e4)
h = self.dec3(h, e3)
h = self.dec2(h, e2)
h = self.dec1(h, e1)
return h
class CascadedASPPNet(nn.Module):
def __init__(self, n_fft):
super(CascadedASPPNet, self).__init__()
self.stg1_low_band_net = BaseASPPNet(2, 32)
self.stg1_high_band_net = BaseASPPNet(2, 32)
self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
self.stg2_full_band_net = BaseASPPNet(16, 32)
self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
self.stg3_full_band_net = BaseASPPNet(32, 64)
self.out = nn.Conv2d(64, 2, 1, bias=False)
self.aux1_out = nn.Conv2d(32, 2, 1, bias=False)
self.aux2_out = nn.Conv2d(32, 2, 1, bias=False)
self.max_bin = n_fft // 2
self.output_bin = n_fft // 2 + 1
self.offset = 128
def forward(self, x, aggressiveness=None):
mix = x.detach()
x = x.clone()
x = x[:, :, :self.max_bin]
bandw = x.size()[2] // 2
aux1 = torch.cat([
self.stg1_low_band_net(x[:, :, :bandw]),
self.stg1_high_band_net(x[:, :, bandw:])
], dim=2)
h = torch.cat([x, aux1], dim=1)
aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
h = torch.cat([x, aux1, aux2], dim=1)
h = self.stg3_full_band_net(self.stg3_bridge(h))
mask = torch.sigmoid(self.out(h))
mask = F.pad(
input=mask,
pad=(0, 0, 0, self.output_bin - mask.size()[2]),
mode='replicate')
if self.training:
aux1 = torch.sigmoid(self.aux1_out(aux1))
aux1 = F.pad(
input=aux1,
pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
mode='replicate')
aux2 = torch.sigmoid(self.aux2_out(aux2))
aux2 = F.pad(
input=aux2,
pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
mode='replicate')
return mask * mix, aux1 * mix, aux2 * mix
else:
if aggressiveness:
mask[:, :, :aggressiveness['split_bin']] = torch.pow(mask[:, :, :aggressiveness['split_bin']], 1 + aggressiveness['value'] / 3)
mask[:, :, aggressiveness['split_bin']:] = torch.pow(mask[:, :, aggressiveness['split_bin']:], 1 + aggressiveness['value'])
return mask * mix
def predict(self, x_mag, aggressiveness=None):
h = self.forward(x_mag, aggressiveness)
if self.offset > 0:
h = h[:, :, :, self.offset:-self.offset]
assert h.size()[3] > 0
return h

View File

@ -0,0 +1,112 @@
import torch
from torch import nn
import torch.nn.functional as F
from uvr5_pack.lib_v5 import layers_33966KB as layers
class BaseASPPNet(nn.Module):
def __init__(self, nin, ch, dilations=(4, 8, 16, 32)):
super(BaseASPPNet, self).__init__()
self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)
self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)
self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)
def __call__(self, x):
h, e1 = self.enc1(x)
h, e2 = self.enc2(h)
h, e3 = self.enc3(h)
h, e4 = self.enc4(h)
h = self.aspp(h)
h = self.dec4(h, e4)
h = self.dec3(h, e3)
h = self.dec2(h, e2)
h = self.dec1(h, e1)
return h
class CascadedASPPNet(nn.Module):
def __init__(self, n_fft):
super(CascadedASPPNet, self).__init__()
self.stg1_low_band_net = BaseASPPNet(2, 16)
self.stg1_high_band_net = BaseASPPNet(2, 16)
self.stg2_bridge = layers.Conv2DBNActiv(18, 8, 1, 1, 0)
self.stg2_full_band_net = BaseASPPNet(8, 16)
self.stg3_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
self.stg3_full_band_net = BaseASPPNet(16, 32)
self.out = nn.Conv2d(32, 2, 1, bias=False)
self.aux1_out = nn.Conv2d(16, 2, 1, bias=False)
self.aux2_out = nn.Conv2d(16, 2, 1, bias=False)
self.max_bin = n_fft // 2
self.output_bin = n_fft // 2 + 1
self.offset = 128
def forward(self, x, aggressiveness=None):
mix = x.detach()
x = x.clone()
x = x[:, :, :self.max_bin]
bandw = x.size()[2] // 2
aux1 = torch.cat([
self.stg1_low_band_net(x[:, :, :bandw]),
self.stg1_high_band_net(x[:, :, bandw:])
], dim=2)
h = torch.cat([x, aux1], dim=1)
aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
h = torch.cat([x, aux1, aux2], dim=1)
h = self.stg3_full_band_net(self.stg3_bridge(h))
mask = torch.sigmoid(self.out(h))
mask = F.pad(
input=mask,
pad=(0, 0, 0, self.output_bin - mask.size()[2]),
mode='replicate')
if self.training:
aux1 = torch.sigmoid(self.aux1_out(aux1))
aux1 = F.pad(
input=aux1,
pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
mode='replicate')
aux2 = torch.sigmoid(self.aux2_out(aux2))
aux2 = F.pad(
input=aux2,
pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
mode='replicate')
return mask * mix, aux1 * mix, aux2 * mix
else:
if aggressiveness:
mask[:, :, :aggressiveness['split_bin']] = torch.pow(mask[:, :, :aggressiveness['split_bin']], 1 + aggressiveness['value'] / 3)
mask[:, :, aggressiveness['split_bin']:] = torch.pow(mask[:, :, aggressiveness['split_bin']:], 1 + aggressiveness['value'])
return mask * mix
def predict(self, x_mag, aggressiveness=None):
h = self.forward(x_mag, aggressiveness)
if self.offset > 0:
h = h[:, :, :, self.offset:-self.offset]
assert h.size()[3] > 0
return h

View File

@ -0,0 +1,113 @@
import torch
import numpy as np
from torch import nn
import torch.nn.functional as F
from uvr5_pack.lib_v5 import layers_537238KB as layers
class BaseASPPNet(nn.Module):
def __init__(self, nin, ch, dilations=(4, 8, 16)):
super(BaseASPPNet, self).__init__()
self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)
self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)
self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)
def __call__(self, x):
h, e1 = self.enc1(x)
h, e2 = self.enc2(h)
h, e3 = self.enc3(h)
h, e4 = self.enc4(h)
h = self.aspp(h)
h = self.dec4(h, e4)
h = self.dec3(h, e3)
h = self.dec2(h, e2)
h = self.dec1(h, e1)
return h
class CascadedASPPNet(nn.Module):
def __init__(self, n_fft):
super(CascadedASPPNet, self).__init__()
self.stg1_low_band_net = BaseASPPNet(2, 64)
self.stg1_high_band_net = BaseASPPNet(2, 64)
self.stg2_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
self.stg2_full_band_net = BaseASPPNet(32, 64)
self.stg3_bridge = layers.Conv2DBNActiv(130, 64, 1, 1, 0)
self.stg3_full_band_net = BaseASPPNet(64, 128)
self.out = nn.Conv2d(128, 2, 1, bias=False)
self.aux1_out = nn.Conv2d(64, 2, 1, bias=False)
self.aux2_out = nn.Conv2d(64, 2, 1, bias=False)
self.max_bin = n_fft // 2
self.output_bin = n_fft // 2 + 1
self.offset = 128
def forward(self, x, aggressiveness=None):
mix = x.detach()
x = x.clone()
x = x[:, :, :self.max_bin]
bandw = x.size()[2] // 2
aux1 = torch.cat([
self.stg1_low_band_net(x[:, :, :bandw]),
self.stg1_high_band_net(x[:, :, bandw:])
], dim=2)
h = torch.cat([x, aux1], dim=1)
aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
h = torch.cat([x, aux1, aux2], dim=1)
h = self.stg3_full_band_net(self.stg3_bridge(h))
mask = torch.sigmoid(self.out(h))
mask = F.pad(
input=mask,
pad=(0, 0, 0, self.output_bin - mask.size()[2]),
mode='replicate')
if self.training:
aux1 = torch.sigmoid(self.aux1_out(aux1))
aux1 = F.pad(
input=aux1,
pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
mode='replicate')
aux2 = torch.sigmoid(self.aux2_out(aux2))
aux2 = F.pad(
input=aux2,
pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
mode='replicate')
return mask * mix, aux1 * mix, aux2 * mix
else:
if aggressiveness:
mask[:, :, :aggressiveness['split_bin']] = torch.pow(mask[:, :, :aggressiveness['split_bin']], 1 + aggressiveness['value'] / 3)
mask[:, :, aggressiveness['split_bin']:] = torch.pow(mask[:, :, aggressiveness['split_bin']:], 1 + aggressiveness['value'])
return mask * mix
def predict(self, x_mag, aggressiveness=None):
h = self.forward(x_mag, aggressiveness)
if self.offset > 0:
h = h[:, :, :, self.offset:-self.offset]
assert h.size()[3] > 0
return h

View File

@ -0,0 +1,113 @@
import torch
import numpy as np
from torch import nn
import torch.nn.functional as F
from uvr5_pack.lib_v5 import layers_537238KB as layers
class BaseASPPNet(nn.Module):
def __init__(self, nin, ch, dilations=(4, 8, 16)):
super(BaseASPPNet, self).__init__()
self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)
self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)
self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)
def __call__(self, x):
h, e1 = self.enc1(x)
h, e2 = self.enc2(h)
h, e3 = self.enc3(h)
h, e4 = self.enc4(h)
h = self.aspp(h)
h = self.dec4(h, e4)
h = self.dec3(h, e3)
h = self.dec2(h, e2)
h = self.dec1(h, e1)
return h
class CascadedASPPNet(nn.Module):
def __init__(self, n_fft):
super(CascadedASPPNet, self).__init__()
self.stg1_low_band_net = BaseASPPNet(2, 64)
self.stg1_high_band_net = BaseASPPNet(2, 64)
self.stg2_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
self.stg2_full_band_net = BaseASPPNet(32, 64)
self.stg3_bridge = layers.Conv2DBNActiv(130, 64, 1, 1, 0)
self.stg3_full_band_net = BaseASPPNet(64, 128)
self.out = nn.Conv2d(128, 2, 1, bias=False)
self.aux1_out = nn.Conv2d(64, 2, 1, bias=False)
self.aux2_out = nn.Conv2d(64, 2, 1, bias=False)
self.max_bin = n_fft // 2
self.output_bin = n_fft // 2 + 1
self.offset = 128
def forward(self, x, aggressiveness=None):
mix = x.detach()
x = x.clone()
x = x[:, :, :self.max_bin]
bandw = x.size()[2] // 2
aux1 = torch.cat([
self.stg1_low_band_net(x[:, :, :bandw]),
self.stg1_high_band_net(x[:, :, bandw:])
], dim=2)
h = torch.cat([x, aux1], dim=1)
aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
h = torch.cat([x, aux1, aux2], dim=1)
h = self.stg3_full_band_net(self.stg3_bridge(h))
mask = torch.sigmoid(self.out(h))
mask = F.pad(
input=mask,
pad=(0, 0, 0, self.output_bin - mask.size()[2]),
mode='replicate')
if self.training:
aux1 = torch.sigmoid(self.aux1_out(aux1))
aux1 = F.pad(
input=aux1,
pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
mode='replicate')
aux2 = torch.sigmoid(self.aux2_out(aux2))
aux2 = F.pad(
input=aux2,
pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
mode='replicate')
return mask * mix, aux1 * mix, aux2 * mix
else:
if aggressiveness:
mask[:, :, :aggressiveness['split_bin']] = torch.pow(mask[:, :, :aggressiveness['split_bin']], 1 + aggressiveness['value'] / 3)
mask[:, :, aggressiveness['split_bin']:] = torch.pow(mask[:, :, aggressiveness['split_bin']:], 1 + aggressiveness['value'])
return mask * mix
def predict(self, x_mag, aggressiveness=None):
h = self.forward(x_mag, aggressiveness)
if self.offset > 0:
h = h[:, :, :, self.offset:-self.offset]
assert h.size()[3] > 0
return h

View File

@ -0,0 +1,112 @@
import torch
from torch import nn
import torch.nn.functional as F
from uvr5_pack.lib_v5 import layers_123821KB as layers
class BaseASPPNet(nn.Module):
def __init__(self, nin, ch, dilations=(4, 8, 16)):
super(BaseASPPNet, self).__init__()
self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)
self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)
self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)
def __call__(self, x):
h, e1 = self.enc1(x)
h, e2 = self.enc2(h)
h, e3 = self.enc3(h)
h, e4 = self.enc4(h)
h = self.aspp(h)
h = self.dec4(h, e4)
h = self.dec3(h, e3)
h = self.dec2(h, e2)
h = self.dec1(h, e1)
return h
class CascadedASPPNet(nn.Module):
def __init__(self, n_fft):
super(CascadedASPPNet, self).__init__()
self.stg1_low_band_net = BaseASPPNet(2, 32)
self.stg1_high_band_net = BaseASPPNet(2, 32)
self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
self.stg2_full_band_net = BaseASPPNet(16, 32)
self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
self.stg3_full_band_net = BaseASPPNet(32, 64)
self.out = nn.Conv2d(64, 2, 1, bias=False)
self.aux1_out = nn.Conv2d(32, 2, 1, bias=False)
self.aux2_out = nn.Conv2d(32, 2, 1, bias=False)
self.max_bin = n_fft // 2
self.output_bin = n_fft // 2 + 1
self.offset = 128
def forward(self, x, aggressiveness=None):
mix = x.detach()
x = x.clone()
x = x[:, :, :self.max_bin]
bandw = x.size()[2] // 2
aux1 = torch.cat([
self.stg1_low_band_net(x[:, :, :bandw]),
self.stg1_high_band_net(x[:, :, bandw:])
], dim=2)
h = torch.cat([x, aux1], dim=1)
aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
h = torch.cat([x, aux1, aux2], dim=1)
h = self.stg3_full_band_net(self.stg3_bridge(h))
mask = torch.sigmoid(self.out(h))
mask = F.pad(
input=mask,
pad=(0, 0, 0, self.output_bin - mask.size()[2]),
mode='replicate')
if self.training:
aux1 = torch.sigmoid(self.aux1_out(aux1))
aux1 = F.pad(
input=aux1,
pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
mode='replicate')
aux2 = torch.sigmoid(self.aux2_out(aux2))
aux2 = F.pad(
input=aux2,
pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
mode='replicate')
return mask * mix, aux1 * mix, aux2 * mix
else:
if aggressiveness:
mask[:, :, :aggressiveness['split_bin']] = torch.pow(mask[:, :, :aggressiveness['split_bin']], 1 + aggressiveness['value'] / 3)
mask[:, :, aggressiveness['split_bin']:] = torch.pow(mask[:, :, aggressiveness['split_bin']:], 1 + aggressiveness['value'])
return mask * mix
def predict(self, x_mag, aggressiveness=None):
h = self.forward(x_mag, aggressiveness)
if self.offset > 0:
h = h[:, :, :, self.offset:-self.offset]
assert h.size()[3] > 0
return h

View File

@ -0,0 +1,485 @@
import os,librosa
import numpy as np
import soundfile as sf
from tqdm import tqdm
import json,math ,hashlib
def crop_center(h1, h2):
h1_shape = h1.size()
h2_shape = h2.size()
if h1_shape[3] == h2_shape[3]:
return h1
elif h1_shape[3] < h2_shape[3]:
raise ValueError('h1_shape[3] must be greater than h2_shape[3]')
# s_freq = (h2_shape[2] - h1_shape[2]) // 2
# e_freq = s_freq + h1_shape[2]
s_time = (h1_shape[3] - h2_shape[3]) // 2
e_time = s_time + h2_shape[3]
h1 = h1[:, :, :, s_time:e_time]
return h1
def wave_to_spectrogram(wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False):
if reverse:
wave_left = np.flip(np.asfortranarray(wave[0]))
wave_right = np.flip(np.asfortranarray(wave[1]))
elif mid_side:
wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2)
wave_right = np.asfortranarray(np.subtract(wave[0], wave[1]))
elif mid_side_b2:
wave_left = np.asfortranarray(np.add(wave[1], wave[0] * .5))
wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * .5))
else:
wave_left = np.asfortranarray(wave[0])
wave_right = np.asfortranarray(wave[1])
spec_left = librosa.stft(wave_left, n_fft, hop_length=hop_length)
spec_right = librosa.stft(wave_right, n_fft, hop_length=hop_length)
spec = np.asfortranarray([spec_left, spec_right])
return spec
def wave_to_spectrogram_mt(wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False):
import threading
if reverse:
wave_left = np.flip(np.asfortranarray(wave[0]))
wave_right = np.flip(np.asfortranarray(wave[1]))
elif mid_side:
wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2)
wave_right = np.asfortranarray(np.subtract(wave[0], wave[1]))
elif mid_side_b2:
wave_left = np.asfortranarray(np.add(wave[1], wave[0] * .5))
wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * .5))
else:
wave_left = np.asfortranarray(wave[0])
wave_right = np.asfortranarray(wave[1])
def run_thread(**kwargs):
global spec_left
spec_left = librosa.stft(**kwargs)
thread = threading.Thread(target=run_thread, kwargs={'y': wave_left, 'n_fft': n_fft, 'hop_length': hop_length})
thread.start()
spec_right = librosa.stft(wave_right, n_fft, hop_length=hop_length)
thread.join()
spec = np.asfortranarray([spec_left, spec_right])
return spec
def combine_spectrograms(specs, mp):
l = min([specs[i].shape[2] for i in specs])
spec_c = np.zeros(shape=(2, mp.param['bins'] + 1, l), dtype=np.complex64)
offset = 0
bands_n = len(mp.param['band'])
for d in range(1, bands_n + 1):
h = mp.param['band'][d]['crop_stop'] - mp.param['band'][d]['crop_start']
spec_c[:, offset:offset+h, :l] = specs[d][:, mp.param['band'][d]['crop_start']:mp.param['band'][d]['crop_stop'], :l]
offset += h
if offset > mp.param['bins']:
raise ValueError('Too much bins')
# lowpass fiter
if mp.param['pre_filter_start'] > 0: # and mp.param['band'][bands_n]['res_type'] in ['scipy', 'polyphase']:
if bands_n == 1:
spec_c = fft_lp_filter(spec_c, mp.param['pre_filter_start'], mp.param['pre_filter_stop'])
else:
gp = 1
for b in range(mp.param['pre_filter_start'] + 1, mp.param['pre_filter_stop']):
g = math.pow(10, -(b - mp.param['pre_filter_start']) * (3.5 - gp) / 20.0)
gp = g
spec_c[:, b, :] *= g
return np.asfortranarray(spec_c)
def spectrogram_to_image(spec, mode='magnitude'):
if mode == 'magnitude':
if np.iscomplexobj(spec):
y = np.abs(spec)
else:
y = spec
y = np.log10(y ** 2 + 1e-8)
elif mode == 'phase':
if np.iscomplexobj(spec):
y = np.angle(spec)
else:
y = spec
y -= y.min()
y *= 255 / y.max()
img = np.uint8(y)
if y.ndim == 3:
img = img.transpose(1, 2, 0)
img = np.concatenate([
np.max(img, axis=2, keepdims=True), img
], axis=2)
return img
def reduce_vocal_aggressively(X, y, softmask):
v = X - y
y_mag_tmp = np.abs(y)
v_mag_tmp = np.abs(v)
v_mask = v_mag_tmp > y_mag_tmp
y_mag = np.clip(y_mag_tmp - v_mag_tmp * v_mask * softmask, 0, np.inf)
return y_mag * np.exp(1.j * np.angle(y))
def mask_silence(mag, ref, thres=0.2, min_range=64, fade_size=32):
if min_range < fade_size * 2:
raise ValueError('min_range must be >= fade_area * 2')
mag = mag.copy()
idx = np.where(ref.mean(axis=(0, 1)) < thres)[0]
starts = np.insert(idx[np.where(np.diff(idx) != 1)[0] + 1], 0, idx[0])
ends = np.append(idx[np.where(np.diff(idx) != 1)[0]], idx[-1])
uninformative = np.where(ends - starts > min_range)[0]
if len(uninformative) > 0:
starts = starts[uninformative]
ends = ends[uninformative]
old_e = None
for s, e in zip(starts, ends):
if old_e is not None and s - old_e < fade_size:
s = old_e - fade_size * 2
if s != 0:
weight = np.linspace(0, 1, fade_size)
mag[:, :, s:s + fade_size] += weight * ref[:, :, s:s + fade_size]
else:
s -= fade_size
if e != mag.shape[2]:
weight = np.linspace(1, 0, fade_size)
mag[:, :, e - fade_size:e] += weight * ref[:, :, e - fade_size:e]
else:
e += fade_size
mag[:, :, s + fade_size:e - fade_size] += ref[:, :, s + fade_size:e - fade_size]
old_e = e
return mag
def align_wave_head_and_tail(a, b):
l = min([a[0].size, b[0].size])
return a[:l,:l], b[:l,:l]
def cache_or_load(mix_path, inst_path, mp):
mix_basename = os.path.splitext(os.path.basename(mix_path))[0]
inst_basename = os.path.splitext(os.path.basename(inst_path))[0]
cache_dir = 'mph{}'.format(hashlib.sha1(json.dumps(mp.param, sort_keys=True).encode('utf-8')).hexdigest())
mix_cache_dir = os.path.join('cache', cache_dir)
inst_cache_dir = os.path.join('cache', cache_dir)
os.makedirs(mix_cache_dir, exist_ok=True)
os.makedirs(inst_cache_dir, exist_ok=True)
mix_cache_path = os.path.join(mix_cache_dir, mix_basename + '.npy')
inst_cache_path = os.path.join(inst_cache_dir, inst_basename + '.npy')
if os.path.exists(mix_cache_path) and os.path.exists(inst_cache_path):
X_spec_m = np.load(mix_cache_path)
y_spec_m = np.load(inst_cache_path)
else:
X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {}
for d in range(len(mp.param['band']), 0, -1):
bp = mp.param['band'][d]
if d == len(mp.param['band']): # high-end band
X_wave[d], _ = librosa.load(
mix_path, bp['sr'], False, dtype=np.float32, res_type=bp['res_type'])
y_wave[d], _ = librosa.load(
inst_path, bp['sr'], False, dtype=np.float32, res_type=bp['res_type'])
else: # lower bands
X_wave[d] = librosa.resample(X_wave[d+1], mp.param['band'][d+1]['sr'], bp['sr'], res_type=bp['res_type'])
y_wave[d] = librosa.resample(y_wave[d+1], mp.param['band'][d+1]['sr'], bp['sr'], res_type=bp['res_type'])
X_wave[d], y_wave[d] = align_wave_head_and_tail(X_wave[d], y_wave[d])
X_spec_s[d] = wave_to_spectrogram(X_wave[d], bp['hl'], bp['n_fft'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse'])
y_spec_s[d] = wave_to_spectrogram(y_wave[d], bp['hl'], bp['n_fft'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse'])
del X_wave, y_wave
X_spec_m = combine_spectrograms(X_spec_s, mp)
y_spec_m = combine_spectrograms(y_spec_s, mp)
if X_spec_m.shape != y_spec_m.shape:
raise ValueError('The combined spectrograms are different: ' + mix_path)
_, ext = os.path.splitext(mix_path)
np.save(mix_cache_path, X_spec_m)
np.save(inst_cache_path, y_spec_m)
return X_spec_m, y_spec_m
def spectrogram_to_wave(spec, hop_length, mid_side, mid_side_b2, reverse):
spec_left = np.asfortranarray(spec[0])
spec_right = np.asfortranarray(spec[1])
wave_left = librosa.istft(spec_left, hop_length=hop_length)
wave_right = librosa.istft(spec_right, hop_length=hop_length)
if reverse:
return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)])
elif mid_side:
return np.asfortranarray([np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)])
elif mid_side_b2:
return np.asfortranarray([np.add(wave_right / 1.25, .4 * wave_left), np.subtract(wave_left / 1.25, .4 * wave_right)])
else:
return np.asfortranarray([wave_left, wave_right])
def spectrogram_to_wave_mt(spec, hop_length, mid_side, reverse, mid_side_b2):
import threading
spec_left = np.asfortranarray(spec[0])
spec_right = np.asfortranarray(spec[1])
def run_thread(**kwargs):
global wave_left
wave_left = librosa.istft(**kwargs)
thread = threading.Thread(target=run_thread, kwargs={'stft_matrix': spec_left, 'hop_length': hop_length})
thread.start()
wave_right = librosa.istft(spec_right, hop_length=hop_length)
thread.join()
if reverse:
return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)])
elif mid_side:
return np.asfortranarray([np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)])
elif mid_side_b2:
return np.asfortranarray([np.add(wave_right / 1.25, .4 * wave_left), np.subtract(wave_left / 1.25, .4 * wave_right)])
else:
return np.asfortranarray([wave_left, wave_right])
def cmb_spectrogram_to_wave(spec_m, mp, extra_bins_h=None, extra_bins=None):
wave_band = {}
bands_n = len(mp.param['band'])
offset = 0
for d in range(1, bands_n + 1):
bp = mp.param['band'][d]
spec_s = np.ndarray(shape=(2, bp['n_fft'] // 2 + 1, spec_m.shape[2]), dtype=complex)
h = bp['crop_stop'] - bp['crop_start']
spec_s[:, bp['crop_start']:bp['crop_stop'], :] = spec_m[:, offset:offset+h, :]
offset += h
if d == bands_n: # higher
if extra_bins_h: # if --high_end_process bypass
max_bin = bp['n_fft'] // 2
spec_s[:, max_bin-extra_bins_h:max_bin, :] = extra_bins[:, :extra_bins_h, :]
if bp['hpf_start'] > 0:
spec_s = fft_hp_filter(spec_s, bp['hpf_start'], bp['hpf_stop'] - 1)
if bands_n == 1:
wave = spectrogram_to_wave(spec_s, bp['hl'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse'])
else:
wave = np.add(wave, spectrogram_to_wave(spec_s, bp['hl'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse']))
else:
sr = mp.param['band'][d+1]['sr']
if d == 1: # lower
spec_s = fft_lp_filter(spec_s, bp['lpf_start'], bp['lpf_stop'])
wave = librosa.resample(spectrogram_to_wave(spec_s, bp['hl'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse']), bp['sr'], sr, res_type="sinc_fastest")
else: # mid
spec_s = fft_hp_filter(spec_s, bp['hpf_start'], bp['hpf_stop'] - 1)
spec_s = fft_lp_filter(spec_s, bp['lpf_start'], bp['lpf_stop'])
wave2 = np.add(wave, spectrogram_to_wave(spec_s, bp['hl'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse']))
# wave = librosa.core.resample(wave2, bp['sr'], sr, res_type="sinc_fastest")
wave = librosa.core.resample(wave2, bp['sr'], sr,res_type='scipy')
return wave.T
def fft_lp_filter(spec, bin_start, bin_stop):
g = 1.0
for b in range(bin_start, bin_stop):
g -= 1 / (bin_stop - bin_start)
spec[:, b, :] = g * spec[:, b, :]
spec[:, bin_stop:, :] *= 0
return spec
def fft_hp_filter(spec, bin_start, bin_stop):
g = 1.0
for b in range(bin_start, bin_stop, -1):
g -= 1 / (bin_start - bin_stop)
spec[:, b, :] = g * spec[:, b, :]
spec[:, 0:bin_stop+1, :] *= 0
return spec
def mirroring(a, spec_m, input_high_end, mp):
if 'mirroring' == a:
mirror = np.flip(np.abs(spec_m[:, mp.param['pre_filter_start']-10-input_high_end.shape[1]:mp.param['pre_filter_start']-10, :]), 1)
mirror = mirror * np.exp(1.j * np.angle(input_high_end))
return np.where(np.abs(input_high_end) <= np.abs(mirror), input_high_end, mirror)
if 'mirroring2' == a:
mirror = np.flip(np.abs(spec_m[:, mp.param['pre_filter_start']-10-input_high_end.shape[1]:mp.param['pre_filter_start']-10, :]), 1)
mi = np.multiply(mirror, input_high_end * 1.7)
return np.where(np.abs(input_high_end) <= np.abs(mi), input_high_end, mi)
def ensembling(a, specs):
for i in range(1, len(specs)):
if i == 1:
spec = specs[0]
ln = min([spec.shape[2], specs[i].shape[2]])
spec = spec[:,:,:ln]
specs[i] = specs[i][:,:,:ln]
if 'min_mag' == a:
spec = np.where(np.abs(specs[i]) <= np.abs(spec), specs[i], spec)
if 'max_mag' == a:
spec = np.where(np.abs(specs[i]) >= np.abs(spec), specs[i], spec)
return spec
def stft(wave, nfft, hl):
wave_left = np.asfortranarray(wave[0])
wave_right = np.asfortranarray(wave[1])
spec_left = librosa.stft(wave_left, nfft, hop_length=hl)
spec_right = librosa.stft(wave_right, nfft, hop_length=hl)
spec = np.asfortranarray([spec_left, spec_right])
return spec
def istft(spec, hl):
spec_left = np.asfortranarray(spec[0])
spec_right = np.asfortranarray(spec[1])
wave_left = librosa.istft(spec_left, hop_length=hl)
wave_right = librosa.istft(spec_right, hop_length=hl)
wave = np.asfortranarray([wave_left, wave_right])
if __name__ == "__main__":
import cv2
import sys
import time
import argparse
from model_param_init import ModelParameters
p = argparse.ArgumentParser()
p.add_argument('--algorithm', '-a', type=str, choices=['invert', 'invert_p', 'min_mag', 'max_mag', 'deep', 'align'], default='min_mag')
p.add_argument('--model_params', '-m', type=str, default=os.path.join('modelparams', '1band_sr44100_hl512.json'))
p.add_argument('--output_name', '-o', type=str, default='output')
p.add_argument('--vocals_only', '-v', action='store_true')
p.add_argument('input', nargs='+')
args = p.parse_args()
start_time = time.time()
if args.algorithm.startswith('invert') and len(args.input) != 2:
raise ValueError('There should be two input files.')
if not args.algorithm.startswith('invert') and len(args.input) < 2:
raise ValueError('There must be at least two input files.')
wave, specs = {}, {}
mp = ModelParameters(args.model_params)
for i in range(len(args.input)):
spec = {}
for d in range(len(mp.param['band']), 0, -1):
bp = mp.param['band'][d]
if d == len(mp.param['band']): # high-end band
wave[d], _ = librosa.load(
args.input[i], bp['sr'], False, dtype=np.float32, res_type=bp['res_type'])
if len(wave[d].shape) == 1: # mono to stereo
wave[d] = np.array([wave[d], wave[d]])
else: # lower bands
wave[d] = librosa.resample(wave[d+1], mp.param['band'][d+1]['sr'], bp['sr'], res_type=bp['res_type'])
spec[d] = wave_to_spectrogram(wave[d], bp['hl'], bp['n_fft'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse'])
specs[i] = combine_spectrograms(spec, mp)
del wave
if args.algorithm == 'deep':
d_spec = np.where(np.abs(specs[0]) <= np.abs(spec[1]), specs[0], spec[1])
v_spec = d_spec - specs[1]
sf.write(os.path.join('{}.wav'.format(args.output_name)), cmb_spectrogram_to_wave(v_spec, mp), mp.param['sr'])
if args.algorithm.startswith('invert'):
ln = min([specs[0].shape[2], specs[1].shape[2]])
specs[0] = specs[0][:,:,:ln]
specs[1] = specs[1][:,:,:ln]
if 'invert_p' == args.algorithm:
X_mag = np.abs(specs[0])
y_mag = np.abs(specs[1])
max_mag = np.where(X_mag >= y_mag, X_mag, y_mag)
v_spec = specs[1] - max_mag * np.exp(1.j * np.angle(specs[0]))
else:
specs[1] = reduce_vocal_aggressively(specs[0], specs[1], 0.2)
v_spec = specs[0] - specs[1]
if not args.vocals_only:
X_mag = np.abs(specs[0])
y_mag = np.abs(specs[1])
v_mag = np.abs(v_spec)
X_image = spectrogram_to_image(X_mag)
y_image = spectrogram_to_image(y_mag)
v_image = spectrogram_to_image(v_mag)
cv2.imwrite('{}_X.png'.format(args.output_name), X_image)
cv2.imwrite('{}_y.png'.format(args.output_name), y_image)
cv2.imwrite('{}_v.png'.format(args.output_name), v_image)
sf.write('{}_X.wav'.format(args.output_name), cmb_spectrogram_to_wave(specs[0], mp), mp.param['sr'])
sf.write('{}_y.wav'.format(args.output_name), cmb_spectrogram_to_wave(specs[1], mp), mp.param['sr'])
sf.write('{}_v.wav'.format(args.output_name), cmb_spectrogram_to_wave(v_spec, mp), mp.param['sr'])
else:
if not args.algorithm == 'deep':
sf.write(os.path.join('ensembled','{}.wav'.format(args.output_name)), cmb_spectrogram_to_wave(ensembling(args.algorithm, specs), mp), mp.param['sr'])
if args.algorithm == 'align':
trackalignment = [
{
'file1':'"{}"'.format(args.input[0]),
'file2':'"{}"'.format(args.input[1])
}
]
for i,e in tqdm(enumerate(trackalignment), desc="Performing Alignment..."):
os.system(f"python lib/align_tracks.py {e['file1']} {e['file2']}")
#print('Total time: {0:.{1}f}s'.format(time.time() - start_time, 1))

242
uvr5_pack/utils.py Normal file
View File

@ -0,0 +1,242 @@
import torch
import numpy as np
from tqdm import tqdm
def make_padding(width, cropsize, offset):
left = offset
roi_size = cropsize - left * 2
if roi_size == 0:
roi_size = cropsize
right = roi_size - (width % roi_size) + left
return left, right, roi_size
def inference(X_spec, device, model, aggressiveness,data):
'''
data dic configs
'''
def _execute(X_mag_pad, roi_size, n_window, device, model, aggressiveness,is_half=True):
model.eval()
with torch.no_grad():
preds = []
iterations = [n_window]
total_iterations = sum(iterations)
for i in tqdm(range(n_window)):
start = i * roi_size
X_mag_window = X_mag_pad[None, :, :, start:start + data['window_size']]
X_mag_window = torch.from_numpy(X_mag_window)
if(is_half==True):X_mag_window=X_mag_window.half()
X_mag_window=X_mag_window.to(device)
pred = model.predict(X_mag_window, aggressiveness)
pred = pred.detach().cpu().numpy()
preds.append(pred[0])
pred = np.concatenate(preds, axis=2)
return pred
def preprocess(X_spec):
X_mag = np.abs(X_spec)
X_phase = np.angle(X_spec)
return X_mag, X_phase
X_mag, X_phase = preprocess(X_spec)
coef = X_mag.max()
X_mag_pre = X_mag / coef
n_frame = X_mag_pre.shape[2]
pad_l, pad_r, roi_size = make_padding(n_frame,
data['window_size'], model.offset)
n_window = int(np.ceil(n_frame / roi_size))
X_mag_pad = np.pad(
X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode='constant')
if(list(model.state_dict().values())[0].dtype==torch.float16):is_half=True
else:is_half=False
pred = _execute(X_mag_pad, roi_size, n_window,
device, model, aggressiveness,is_half)
pred = pred[:, :, :n_frame]
if data['tta']:
pad_l += roi_size // 2
pad_r += roi_size // 2
n_window += 1
X_mag_pad = np.pad(
X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode='constant')
pred_tta = _execute(X_mag_pad, roi_size, n_window,
device, model, aggressiveness,is_half)
pred_tta = pred_tta[:, :, roi_size // 2:]
pred_tta = pred_tta[:, :, :n_frame]
return (pred + pred_tta) * 0.5 * coef, X_mag, np.exp(1.j * X_phase)
else:
return pred * coef, X_mag, np.exp(1.j * X_phase)
def _get_name_params(model_path , model_hash):
ModelName = model_path
if model_hash == '47939caf0cfe52a0e81442b85b971dfd':
model_params_auto=str('runtime/Lib/site-packages/uvr5_pack/lib_v5/modelparams/4band_44100.json')
param_name_auto=str('4band_44100')
if model_hash == '4e4ecb9764c50a8c414fee6e10395bbe':
model_params_auto=str('runtime/Lib/site-packages/uvr5_pack/lib_v5/modelparams/4band_v2.json')
param_name_auto=str('4band_v2')
if model_hash == 'ca106edd563e034bde0bdec4bb7a4b36':
model_params_auto=str('runtime/Lib/site-packages/uvr5_pack/lib_v5/modelparams/4band_v2.json')
param_name_auto=str('4band_v2')
if model_hash == 'e60a1e84803ce4efc0a6551206cc4b71':
model_params_auto=str('runtime/Lib/site-packages/uvr5_pack/lib_v5/modelparams/4band_44100.json')
param_name_auto=str('4band_44100')
if model_hash == 'a82f14e75892e55e994376edbf0c8435':
model_params_auto=str('runtime/Lib/site-packages/uvr5_pack/lib_v5/modelparams/4band_44100.json')
param_name_auto=str('4band_44100')
if model_hash == '6dd9eaa6f0420af9f1d403aaafa4cc06':
model_params_auto=str('runtime/Lib/site-packages/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json')
param_name_auto=str('4band_v2_sn')
if model_hash == '08611fb99bd59eaa79ad27c58d137727':
model_params_auto=str('runtime/Lib/site-packages/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json')
param_name_auto=str('4band_v2_sn')
if model_hash == '5c7bbca45a187e81abbbd351606164e5':
model_params_auto=str('runtime/Lib/site-packages/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json')
param_name_auto=str('3band_44100_msb2')
if model_hash == 'd6b2cb685a058a091e5e7098192d3233':
model_params_auto=str('runtime/Lib/site-packages/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json')
param_name_auto=str('3band_44100_msb2')
if model_hash == 'c1b9f38170a7c90e96f027992eb7c62b':
model_params_auto=str('runtime/Lib/site-packages/uvr5_pack/lib_v5/modelparams/4band_44100.json')
param_name_auto=str('4band_44100')
if model_hash == 'c3448ec923fa0edf3d03a19e633faa53':
model_params_auto=str('runtime/Lib/site-packages/uvr5_pack/lib_v5/modelparams/4band_44100.json')
param_name_auto=str('4band_44100')
if model_hash == '68aa2c8093d0080704b200d140f59e54':
model_params_auto=str('runtime/Lib/site-packages/uvr5_pack/lib_v5/modelparams/3band_44100.json')
param_name_auto=str('3band_44100.json')
if model_hash == 'fdc83be5b798e4bd29fe00fe6600e147':
model_params_auto=str('runtime/Lib/site-packages/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json')
param_name_auto=str('3band_44100_mid.json')
if model_hash == '2ce34bc92fd57f55db16b7a4def3d745':
model_params_auto=str('runtime/Lib/site-packages/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json')
param_name_auto=str('3band_44100_mid.json')
if model_hash == '52fdca89576f06cf4340b74a4730ee5f':
model_params_auto=str('runtime/Lib/site-packages/uvr5_pack/lib_v5/modelparams/4band_44100.json')
param_name_auto=str('4band_44100.json')
if model_hash == '41191165b05d38fc77f072fa9e8e8a30':
model_params_auto=str('runtime/Lib/site-packages/uvr5_pack/lib_v5/modelparams/4band_44100.json')
param_name_auto=str('4band_44100.json')
if model_hash == '89e83b511ad474592689e562d5b1f80e':
model_params_auto=str('runtime/Lib/site-packages/uvr5_pack/lib_v5/modelparams/2band_32000.json')
param_name_auto=str('2band_32000.json')
if model_hash == '0b954da81d453b716b114d6d7c95177f':
model_params_auto=str('runtime/Lib/site-packages/uvr5_pack/lib_v5/modelparams/2band_32000.json')
param_name_auto=str('2band_32000.json')
#v4 Models
if model_hash == '6a00461c51c2920fd68937d4609ed6c8':
model_params_auto=str('runtime/Lib/site-packages/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json')
param_name_auto=str('1band_sr16000_hl512')
if model_hash == '0ab504864d20f1bd378fe9c81ef37140':
model_params_auto=str('runtime/Lib/site-packages/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json')
param_name_auto=str('1band_sr32000_hl512')
if model_hash == '7dd21065bf91c10f7fccb57d7d83b07f':
model_params_auto=str('runtime/Lib/site-packages/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json')
param_name_auto=str('1band_sr32000_hl512')
if model_hash == '80ab74d65e515caa3622728d2de07d23':
model_params_auto=str('runtime/Lib/site-packages/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json')
param_name_auto=str('1band_sr32000_hl512')
if model_hash == 'edc115e7fc523245062200c00caa847f':
model_params_auto=str('runtime/Lib/site-packages/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json')
param_name_auto=str('1band_sr33075_hl384')
if model_hash == '28063e9f6ab5b341c5f6d3c67f2045b7':
model_params_auto=str('runtime/Lib/site-packages/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json')
param_name_auto=str('1band_sr33075_hl384')
if model_hash == 'b58090534c52cbc3e9b5104bad666ef2':
model_params_auto=str('runtime/Lib/site-packages/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json')
param_name_auto=str('1band_sr44100_hl512')
if model_hash == '0cdab9947f1b0928705f518f3c78ea8f':
model_params_auto=str('runtime/Lib/site-packages/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json')
param_name_auto=str('1band_sr44100_hl512')
if model_hash == 'ae702fed0238afb5346db8356fe25f13':
model_params_auto=str('runtime/Lib/site-packages/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json')
param_name_auto=str('1band_sr44100_hl1024')
#User Models
#1 Band
if '1band_sr16000_hl512' in ModelName:
model_params_auto=str('runtime/Lib/site-packages/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json')
param_name_auto=str('1band_sr16000_hl512')
if '1band_sr32000_hl512' in ModelName:
model_params_auto=str('runtime/Lib/site-packages/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json')
param_name_auto=str('1band_sr32000_hl512')
if '1band_sr33075_hl384' in ModelName:
model_params_auto=str('runtime/Lib/site-packages/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json')
param_name_auto=str('1band_sr33075_hl384')
if '1band_sr44100_hl256' in ModelName:
model_params_auto=str('runtime/Lib/site-packages/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json')
param_name_auto=str('1band_sr44100_hl256')
if '1band_sr44100_hl512' in ModelName:
model_params_auto=str('runtime/Lib/site-packages/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json')
param_name_auto=str('1band_sr44100_hl512')
if '1band_sr44100_hl1024' in ModelName:
model_params_auto=str('runtime/Lib/site-packages/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json')
param_name_auto=str('1band_sr44100_hl1024')
#2 Band
if '2band_44100_lofi' in ModelName:
model_params_auto=str('runtime/Lib/site-packages/uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json')
param_name_auto=str('2band_44100_lofi')
if '2band_32000' in ModelName:
model_params_auto=str('runtime/Lib/site-packages/uvr5_pack/lib_v5/modelparams/2band_32000.json')
param_name_auto=str('2band_32000')
if '2band_48000' in ModelName:
model_params_auto=str('runtime/Lib/site-packages/uvr5_pack/lib_v5/modelparams/2band_48000.json')
param_name_auto=str('2band_48000')
#3 Band
if '3band_44100' in ModelName:
model_params_auto=str('runtime/Lib/site-packages/uvr5_pack/lib_v5/modelparams/3band_44100.json')
param_name_auto=str('3band_44100')
if '3band_44100_mid' in ModelName:
model_params_auto=str('runtime/Lib/site-packages/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json')
param_name_auto=str('3band_44100_mid')
if '3band_44100_msb2' in ModelName:
model_params_auto=str('runtime/Lib/site-packages/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json')
param_name_auto=str('3band_44100_msb2')
#4 Band
if '4band_44100' in ModelName:
model_params_auto=str('runtime/Lib/site-packages/uvr5_pack/lib_v5/modelparams/4band_44100.json')
param_name_auto=str('4band_44100')
if '4band_44100_mid' in ModelName:
model_params_auto=str('runtime/Lib/site-packages/uvr5_pack/lib_v5/modelparams/4band_44100_mid.json')
param_name_auto=str('4band_44100_mid')
if '4band_44100_msb' in ModelName:
model_params_auto=str('runtime/Lib/site-packages/uvr5_pack/lib_v5/modelparams/4band_44100_msb.json')
param_name_auto=str('4band_44100_msb')
if '4band_44100_msb2' in ModelName:
model_params_auto=str('runtime/Lib/site-packages/uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json')
param_name_auto=str('4band_44100_msb2')
if '4band_44100_reverse' in ModelName:
model_params_auto=str('runtime/Lib/site-packages/uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json')
param_name_auto=str('4band_44100_reverse')
if '4band_44100_sw' in ModelName:
model_params_auto=str('runtime/Lib/site-packages/uvr5_pack/lib_v5/modelparams/4band_44100_sw.json')
param_name_auto=str('4band_44100_sw')
if '4band_v2' in ModelName:
model_params_auto=str('runtime/Lib/site-packages/uvr5_pack/lib_v5/modelparams/4band_v2.json')
param_name_auto=str('4band_v2')
if '4band_v2_sn' in ModelName:
model_params_auto=str('runtime/Lib/site-packages/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json')
param_name_auto=str('4band_v2_sn')
if 'tmodelparam' in ModelName:
model_params_auto=str('runtime/Lib/site-packages/uvr5_pack/lib_v5/modelparams/tmodelparam.json')
param_name_auto=str('User Model Param Set')
return param_name_auto , model_params_auto

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5908891829634926119720241e8573d97cbeb8277110a7512bdb0bd7563258ee
size 63454827

225
vc_infer_pipeline.py Normal file
View File

@ -0,0 +1,225 @@
import numpy as np,parselmouth,torch,pdb
from time import time as ttime
import torch.nn.functional as F
from config import x_pad,x_query,x_center,x_max
from sklearn.cluster import KMeans
def resize2d(x, target_len,is1):
minn=1 if is1==True else 0
ss = np.array(x).astype("float32")
ss[ss <=minn] = np.nan
target = np.interp(np.arange(0, len(ss) * target_len, len(ss)) / target_len, np.arange(0, len(ss)), ss)
res = np.nan_to_num(target)
return res
class VC(object):
def __init__(self,tgt_sr,device,is_half):
self.sr=16000#hubert输入采样率
self.window=160#每帧点数
self.t_pad=self.sr*x_pad#每条前后pad时间
self.t_pad_tgt=tgt_sr*x_pad
self.t_pad2=self.t_pad*2
self.t_query=self.sr*x_query#查询切点前后查询时间
self.t_center=self.sr*x_center#查询切点位置
self.t_max=self.sr*x_max#免查询时长阈值
self.device=device
self.is_half=is_half
def get_f0(self,x, p_len,f0_up_key=0,inp_f0=None):
time_step = self.window / self.sr * 1000
f0_min = 50
f0_max = 1100
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
f0_mel_max = 1127 * np.log(1 + f0_max / 700)
f0 = parselmouth.Sound(x, self.sr).to_pitch_ac(
time_step=time_step / 1000, voicing_threshold=0.6,
pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']
pad_size=(p_len - len(f0) + 1) // 2
if(pad_size>0 or p_len - len(f0) - pad_size>0):
f0 = np.pad(f0,[[pad_size,p_len - len(f0) - pad_size]], mode='constant')
f0 *= pow(2, f0_up_key / 12)
# with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
tf0=self.sr//self.window#每秒f0点数
if (inp_f0 is not None):
delta_t=np.round((inp_f0[:,0].max()-inp_f0[:,0].min())*tf0+1).astype("int16")
replace_f0=np.interp(list(range(delta_t)), inp_f0[:, 0]*100, inp_f0[:, 1])
shape=f0[x_pad*tf0:x_pad*tf0+len(replace_f0)].shape[0]
f0[x_pad*tf0:x_pad*tf0+len(replace_f0)]=replace_f0[:shape]
# with open("test_opt.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
f0bak = f0.copy()
f0_mel = 1127 * np.log(1 + f0 / 700)
f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1
f0_mel[f0_mel <= 1] = 1
f0_mel[f0_mel > 255] = 255
f0_coarse = np.rint(f0_mel).astype(np.int)
return f0_coarse, f0bak#1-0
def vc(self,model,net_g,dv,audio0,pitch,pitchf,times):
feats = torch.from_numpy(audio0)
if(self.is_half==True):feats=feats.half()
else:feats=feats.float()
if feats.dim() == 2: # double channels
feats = feats.mean(-1)
assert feats.dim() == 1, feats.dim()
feats = feats.view(1, -1)
padding_mask = torch.BoolTensor(feats.shape).fill_(False)
inputs = {
"source": feats.to(self.device),
"padding_mask": padding_mask.to(self.device),
"output_layer": 9, # layer 9
}
t0 = ttime()
with torch.no_grad():
logits = model.extract_features(**inputs)
feats = model.final_proj(logits[0])
feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
t1 = ttime()
p_len = audio0.shape[0]//self.window
if(feats.shape[1]<p_len):
p_len=feats.shape[1]
pitch=pitch[:,:p_len]
pitchf=pitchf[:,:p_len]
p_len=torch.LongTensor([p_len]).to(self.device)
with torch.no_grad():
audio1 = (net_g.infer(feats, p_len, pitch, pitchf, dv)[0][0, 0] * 32768).data.cpu().float().numpy().astype(np.int16)
del feats,p_len,padding_mask
torch.cuda.empty_cache()
t2 = ttime()
times[0] += (t1 - t0)
times[2] += (t2 - t1)
return audio1
def vc_km(self,model,net_g,dv,audio0,pitch,pitchf,times):
kmeans = KMeans(500)
def get_cluster_result(x):
"""x: np.array [t, 256]"""
return kmeans.predict(x)
checkpoint = torch.load("lulu_contentvec_kmeans_500.pt")
kmeans.__dict__["n_features_in_"] = checkpoint["n_features_in_"]
kmeans.__dict__["_n_threads"] = checkpoint["_n_threads"]
kmeans.__dict__["cluster_centers_"] = checkpoint["cluster_centers_"]
feats = torch.from_numpy(audio0).float()
if feats.dim() == 2: # double channels
feats = feats.mean(-1)
assert feats.dim() == 1, feats.dim()
feats = feats.view(1, -1)
padding_mask = torch.BoolTensor(feats.shape).fill_(False)
inputs = {
"source": feats.half().to(self.device),
"padding_mask": padding_mask.to(self.device),
"output_layer": 9, # layer 9
}
torch.cuda.synchronize()
t0 = ttime()
with torch.no_grad():
logits = model.extract_features(**inputs)
feats = model.final_proj(logits[0])
feats = get_cluster_result(feats.cpu().numpy()[0].astype("float32"))
feats = torch.from_numpy(feats).to(self.device)
feats = F.interpolate(feats.half().unsqueeze(0).unsqueeze(0), scale_factor=2).long().squeeze(0)
t1 = ttime()
p_len = audio0.shape[0]//self.window
if(feats.shape[1]<p_len):
p_len=feats.shape[1]
pitch=pitch[:,:p_len]
pitchf=pitchf[:,:p_len]
p_len=torch.LongTensor([p_len]).to(self.device)
with torch.no_grad():
audio1 = (net_g.infer(feats, p_len, pitch, pitchf, dv)[0][0, 0] * 32768).data.cpu().float().numpy().astype(np.int16)
del feats,p_len,padding_mask
torch.cuda.empty_cache()
t2 = ttime()
times[0] += (t1 - t0)
times[2] += (t2 - t1)
return audio1
def pipeline(self,model,net_g,dv,audio,times,f0_up_key,f0_file=None):
audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode='reflect')
opt_ts = []
if(audio_pad.shape[0]>self.t_max):
audio_sum = np.zeros_like(audio)
for i in range(self.window): audio_sum += audio_pad[i:i - self.window]
for t in range(self.t_center, audio.shape[0],self.t_center):opt_ts.append(t - self.t_query + np.where(np.abs(audio_sum[t - self.t_query:t + self.t_query]) == np.abs(audio_sum[t - self.t_query:t + self.t_query]).min())[0][0])
s = 0
audio_opt=[]
t=None
t1=ttime()
audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode='reflect')
p_len=audio_pad.shape[0]//self.window
inp_f0=None
if(hasattr(f0_file,'name') ==True):
try:
with open(f0_file.name,"r")as f:
lines=f.read().strip("\n").split("\n")
inp_f0=[]
for line in lines:inp_f0.append([float(i)for i in line.split(",")])
inp_f0=np.array(inp_f0,dtype="float32")
except:
traceback.print_exc()
pitch, pitchf = self.get_f0(audio_pad, p_len, f0_up_key,inp_f0)
pitch = pitch[:p_len]
pitchf = pitchf[:p_len]
# if(inp_f0 is None):
# pitch = pitch[:p_len]
# pitchf = pitchf[:p_len]
# else:
# pitch=resize2d(pitch,p_len,is1=True)
# pitchf=resize2d(pitchf,p_len,is1=False)
pitch = torch.LongTensor(pitch).unsqueeze(0).to(self.device)
pitchf = torch.FloatTensor(pitchf).unsqueeze(0).to(self.device)
t2=ttime()
times[1] += (t2 - t1)
for t in opt_ts:
t=t//self.window*self.window
audio_opt.append(self.vc(model,net_g,dv,audio_pad[s:t+self.t_pad2+self.window],pitch[:,s//self.window:(t+self.t_pad2)//self.window],pitchf[:,s//self.window:(t+self.t_pad2)//self.window],times)[self.t_pad_tgt:-self.t_pad_tgt])
s = t
audio_opt.append(self.vc(model,net_g,dv,audio_pad[t:],pitch[:,t//self.window:]if t is not None else pitch,pitchf[:,t//self.window:]if t is not None else pitchf,times)[self.t_pad_tgt:-self.t_pad_tgt])
audio_opt=np.concatenate(audio_opt)
del pitch,pitchf
return audio_opt
def pipeline_km(self,model,net_g,dv,audio,times,f0_up_key,f0_file=None):
audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode='reflect')
opt_ts = []
if(audio_pad.shape[0]>self.t_max):
audio_sum = np.zeros_like(audio)
for i in range(self.window): audio_sum += audio_pad[i:i - self.window]
for t in range(self.t_center, audio.shape[0],self.t_center):opt_ts.append(t - self.t_query + np.where(np.abs(audio_sum[t - self.t_query:t + self.t_query]) == np.abs(audio_sum[t - self.t_query:t + self.t_query]).min())[0][0])
s = 0
audio_opt=[]
t=None
t1=ttime()
audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode='reflect')
p_len=audio_pad.shape[0]//self.window
inp_f0=None
if(hasattr(f0_file,'name') ==True):
try:
with open(f0_file.name,"r")as f:
lines=f.read().strip("\n").split("\n")
inp_f0=[]
for line in lines:inp_f0.append([float(i)for i in line.split(",")])
inp_f0=np.array(inp_f0,dtype="float32")
except:
traceback.print_exc()
pitch, pitchf = self.get_f0(audio_pad, p_len, f0_up_key,inp_f0)
pitch = pitch[:p_len]
pitchf = pitchf[:p_len]
# if(inp_f0 is None):
# pitch = pitch[:p_len]
# pitchf = pitchf[:p_len]
# else:
# pitch=resize2d(pitch,p_len,is1=True)
# pitchf=resize2d(pitchf,p_len,is1=False)
pitch = torch.LongTensor(pitch).unsqueeze(0).to(self.device)
pitchf = torch.FloatTensor(pitchf).unsqueeze(0).to(self.device)
t2=ttime()
times[1] += (t2 - t1)
for t in opt_ts:
t=t//self.window*self.window
audio_opt.append(self.vc_km(model,net_g,dv,audio_pad[s:t+self.t_pad2+self.window],pitch[:,s//self.window:(t+self.t_pad2)//self.window],pitchf[:,s//self.window:(t+self.t_pad2)//self.window],times)[self.t_pad_tgt:-self.t_pad_tgt])
s = t
audio_opt.append(self.vc_km(model,net_g,dv,audio_pad[t:],pitch[:,t//self.window:]if t is not None else pitch,pitchf[:,t//self.window:]if t is not None else pitchf,times)[self.t_pad_tgt:-self.t_pad_tgt])
audio_opt=np.concatenate(audio_opt)
del pitch,pitchf
return audio_opt

3
weights/白菜357k.pt Normal file
View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d309e8056dff08d33b30854839a9b9c36dfb612bf5971c070f552bde18158a55
size 72645217

View File

@ -0,0 +1,54 @@
MIT License
Copyright (c) 2022 lj1995
本软件仅供研究使用,使用软件者、传播软件导出的声音者自负全责。如不认可该条款,则不能使用/引用软件包内所有代码和文件。
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
#################
ContentVec
https://github.com/auspicious3000/contentvec/blob/main/LICENSE
MIT License
#################
VITS
https://github.com/jaywalnut310/vits/blob/main/LICENSE
MIT License
#################
HIFIGAN
https://github.com/jik876/hifi-gan/blob/master/LICENSE
MIT License
#################
gradio
https://github.com/gradio-app/gradio/blob/main/LICENSE
Apache License 2.0
#################
ffmpeg
https://github.com/FFmpeg/FFmpeg/blob/master/COPYING.LGPLv3
https://github.com/BtbN/FFmpeg-Builds/releases/download/autobuild-2021-02-28-12-32/ffmpeg-n4.3.2-160-gfbb9368226-win64-lgpl-4.3.zip
LPGLv3 License
MIT License
#################
ultimatevocalremovergui
https://github.com/Anjok07/ultimatevocalremovergui/blob/master/LICENSE
https://github.com/yang123qwe/vocal_separation_by_uvr5
MIT License
#################
audio-slicer
https://github.com/openvpi/audio-slicer/blob/main/LICENSE
MIT License