diff --git a/.gitignore b/.gitignore
index 7e5fda4..3d79a6f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,4 @@ data/**
 ./extensions/**
 venv/**
 *.pyc
+.idea/**
diff --git a/data/models/audioldm/.placeholder b/data/models/audioldm/.placeholder
deleted file mode 100644
index e69de29..0000000
diff --git a/data/models/rvc/.placeholder b/data/models/rvc/.placeholder
deleted file mode 100644
index e69de29..0000000
diff --git a/data/models/unclassified/.placeholder b/data/models/unclassified/.placeholder
deleted file mode 100644
index e69de29..0000000
diff --git a/default_models.json b/default_models.json
new file mode 100644
index 0000000..9bbb13e
--- /dev/null
+++ b/default_models.json
@@ -0,0 +1,310 @@
+{
+    "suno/bark||text_2.pt": {
+        "model_name": null,
+        "model_type": "text-to-speech",
+        "single_file": true,
+        "single_file_name": "text_2.pt",
+        "save_file_name": null,
+        "allow_patterns": null,
+        "ignore_patterns": null
+    },
+    "google-bert/bert-base-multilingual-cased": {
+        "model_name": null,
+        "model_type": null,
+        "single_file": false,
+        "single_file_name": null,
+        "save_file_name": null,
+        "allow_patterns": [
+            "*.safetensors",
+            "*.json",
+            "*.txt"
+        ],
+        "ignore_patterns": null
+    },
+    "suno/bark||coarse_2.pt": {
+        "model_name": null,
+        "model_type": "text-to-speech",
+        "single_file": true,
+        "single_file_name": "coarse_2.pt",
+        "save_file_name": null,
+        "allow_patterns": null,
+        "ignore_patterns": null
+    },
+    "suno/bark||fine_2.pt": {
+        "model_name": null,
+        "model_type": "text-to-speech",
+        "single_file": true,
+        "single_file_name": "fine_2.pt",
+        "save_file_name": null,
+        "allow_patterns": null,
+        "ignore_patterns": null
+    },
+    "https://dl.fbaipublicfiles.com/encodec/v0/encodec_24khz-d7cc33bc.th||encodec_24khz-d7cc33bc.th": {
+        "model_name": null,
+        "model_type": "encodec",
+        "single_file": true,
+        "single_file_name": "encodec_24khz-d7cc33bc.th",
+        "save_file_name": null,
+        "allow_patterns": null,
+        "ignore_patterns": null
+    },
+    "https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt||hubert_base_ls960.pt": {
+        "model_name": null,
+        "model_type": "hubert",
+        "single_file": true,
+        "single_file_name": "hubert_base_ls960.pt",
+        "save_file_name": "hubert.pt",
+        "allow_patterns": null,
+        "ignore_patterns": null
+    },
+    "GitMylo/bark-voice-cloning||quantifier_hubert_base_ls960_14.pth": {
+        "model_name": null,
+        "model_type": "hubert",
+        "single_file": true,
+        "single_file_name": "quantifier_hubert_base_ls960_14.pth",
+        "save_file_name": "tokenizer.pth",
+        "allow_patterns": null,
+        "ignore_patterns": null
+    },
+    "Hobis/bark-voice-cloning-polish-HuBERT-quantizer||polish-HuBERT-quantizer_8_epoch.pth": {
+        "model_name": null,
+        "model_type": "hubert",
+        "single_file": true,
+        "single_file_name": "polish-HuBERT-quantizer_8_epoch.pth",
+        "save_file_name": "tokenizer_pol.pth",
+        "allow_patterns": null,
+        "ignore_patterns": null
+    },
+    "cvssp/audioldm": {
+        "model_name": null,
+        "model_type": "music-generation",
+        "single_file": false,
+        "single_file_name": null,
+        "save_file_name": null,
+        "allow_patterns": [
+            "*.safetensors",
+            "*.json",
+            "*.txt"
+        ],
+        "ignore_patterns": null
+    },
+    "cvssp/audioldm-s-full-v2": {
+        "model_name": null,
+        "model_type": "music-generation",
+        "single_file": false,
+        "single_file_name": null,
+        "save_file_name": null,
+        "allow_patterns": [
+            "*.safetensors",
+            "*.json",
+            "*.txt"
+        ],
+        "ignore_patterns": null
+    },
+    "cvssp/audioldm-m-full": {
+        "model_name": null,
+        "model_type": "music-generation",
+        "single_file": false,
+        "single_file_name": null,
"save_file_name": null, + "allow_patterns": [ + "*.safetensors", + "*.json", + "*.txt" + ], + "ignore_patterns": null + }, + "sanchit-gandhi/clap-htsat-unfused-m-full": { + "model_name": null, + "model_type": "music-generation", + "single_file": false, + "single_file_name": null, + "save_file_name": null, + "allow_patterns": null, + "ignore_patterns": null + }, + "cvssp/audioldm-l-full": { + "model_name": null, + "model_type": "music-generation", + "single_file": false, + "single_file_name": null, + "save_file_name": null, + "allow_patterns": [ + "*.safetensors", + "*.json", + "*.txt" + ], + "ignore_patterns": null + }, + "facebook/musicgen-small": { + "model_name": null, + "model_type": "music-generation", + "single_file": false, + "single_file_name": null, + "save_file_name": null, + "allow_patterns": null, + "ignore_patterns": null + }, + "facebook/musicgen-medium": { + "model_name": null, + "model_type": "music-generation", + "single_file": false, + "single_file_name": null, + "save_file_name": null, + "allow_patterns": null, + "ignore_patterns": null + }, + "facebook/musicgen-large": { + "model_name": null, + "model_type": "music-generation", + "single_file": false, + "single_file_name": null, + "save_file_name": null, + "allow_patterns": null, + "ignore_patterns": null + }, + "https://openaipublic.azureedge.net/main/whisper/models/e5b1a55b89c1367dacf97e3e19bfd829a01529dbfdeefa8caeb59b3f1b81dadb/large-v3.pt||large-v3.pt": { + "model_name": null, + "model_type": "whisper", + "single_file": true, + "single_file_name": "large-v3.pt", + "save_file_name": null, + "allow_patterns": null, + "ignore_patterns": null + }, + "https://openaipublic.azureedge.net/main/whisper/models/81f7c96c852ee8fc832187b0132e569d6c3065a3252ed18e56effd0b6a73e524/large-v2.pt||large-v2.pt": { + "model_name": null, + "model_type": "whisper", + "single_file": true, + "single_file_name": "large-v2.pt", + "save_file_name": null, + "allow_patterns": null, + "ignore_patterns": null + }, + "https://openaipublic.azureedge.net/main/whisper/models/e4b87e7e0bf463eb8e6956e646f1e277e901512310def2c24bf0e11bd3c28e9a/large-v1.pt||large-v1.pt": { + "model_name": null, + "model_type": "whisper", + "single_file": true, + "single_file_name": "large-v1.pt", + "save_file_name": null, + "allow_patterns": null, + "ignore_patterns": null + }, + "https://openaipublic.azureedge.net/main/whisper/models/345ae4da62f9b3d59415adc60127b97c714f32e89e936602e85993674d08dcb1/medium.pt||medium.pt": { + "model_name": null, + "model_type": "whisper", + "single_file": true, + "single_file_name": "medium.pt", + "save_file_name": null, + "allow_patterns": null, + "ignore_patterns": null + }, + "https://openaipublic.azureedge.net/main/whisper/models/d7440d1dc186f76616474e0ff0b3b6b879abc9d1a4926b7adfa41db2d497ab4f/medium.en.pt||medium.en.pt": { + "model_name": null, + "model_type": "whisper", + "single_file": true, + "single_file_name": "medium.en.pt", + "save_file_name": null, + "allow_patterns": null, + "ignore_patterns": null + }, + "https://openaipublic.azureedge.net/main/whisper/models/9ecf779972d90ba49c06d968637d720dd632c55bbf19d441fb42bf17a411e794/small.pt||small.pt": { + "model_name": null, + "model_type": "whisper", + "single_file": true, + "single_file_name": "small.pt", + "save_file_name": null, + "allow_patterns": null, + "ignore_patterns": null + }, + "https://openaipublic.azureedge.net/main/whisper/models/f953ad0fd29cacd07d5a9eda5624af0f6bcf2258be67c92b79389873d91e0872/small.en.pt||small.en.pt": { + "model_name": null, + "model_type": "whisper", 
+ "single_file": true, + "single_file_name": "small.en.pt", + "save_file_name": null, + "allow_patterns": null, + "ignore_patterns": null + }, + "https://openaipublic.azureedge.net/main/whisper/models/ed3a0b6b1c0edf879ad9b11b1af5a0e6ab5db9205f891f668f8b0e6c6326e34e/base.pt||base.pt": { + "model_name": null, + "model_type": "whisper", + "single_file": true, + "single_file_name": "base.pt", + "save_file_name": null, + "allow_patterns": null, + "ignore_patterns": null + }, + "https://openaipublic.azureedge.net/main/whisper/models/25a8566e1d0c1e2231d1c762132cd20e0f96a85d16145c3a00adf5d1ac670ead/base.en.pt||base.en.pt": { + "model_name": null, + "model_type": "whisper", + "single_file": true, + "single_file_name": "base.en.pt", + "save_file_name": null, + "allow_patterns": null, + "ignore_patterns": null + }, + "https://openaipublic.azureedge.net/main/whisper/models/65147644a518d12f04e32d6f3b26facc3f8dd46e5390956a9424a650c0ce22b9/tiny.pt||tiny.pt": { + "model_name": null, + "model_type": "whisper", + "single_file": true, + "single_file_name": "tiny.pt", + "save_file_name": null, + "allow_patterns": null, + "ignore_patterns": null + }, + "https://openaipublic.azureedge.net/main/whisper/models/d3dd57d32accea0b295c96e26691aa14d8822fac7d9d27d5dc00b4ca2826dd03/tiny.en.pt||tiny.en.pt": { + "model_name": null, + "model_type": "whisper", + "single_file": true, + "single_file_name": "tiny.en.pt", + "save_file_name": null, + "allow_patterns": null, + "ignore_patterns": null + }, + "sail-rvc/Rick_Sanchez_Lat_v2": { + "model_name": null, + "model_type": "rvc", + "single_file": false, + "single_file_name": null, + "save_file_name": null, + "allow_patterns": null, + "ignore_patterns": null + }, + "sail-rvc/Rick_Sanchez_Lat_v2||model.pth": { + "model_name": null, + "model_type": "rvc", + "single_file": true, + "single_file_name": "model.pth", + "save_file_name": "Rick_Sanchez_Lat_v2", + "allow_patterns": null, + "ignore_patterns": null + }, + "sail-rvc/Rick_Sanchez_C137_lat||model.pth": { + "model_name": null, + "model_type": "rvc", + "single_file": true, + "single_file_name": "model.pth", + "save_file_name": "Rick_Sanchez_C137_lat", + "allow_patterns": null, + "ignore_patterns": null + }, + "sail-rvc/georgewbush||model.pth": { + "model_name": null, + "model_type": "rvc", + "single_file": true, + "single_file_name": "model.pth", + "save_file_name": null, + "allow_patterns": null, + "ignore_patterns": null + }, + "0x3e9/Trump_RVC||model.pth": { + "model_name": null, + "model_type": "rvc", + "single_file": true, + "single_file_name": "model.pth", + "save_file_name": null, + "allow_patterns": null, + "ignore_patterns": null + } +} \ No newline at end of file diff --git a/hubert/hubert_manager.py b/hubert/hubert_manager.py deleted file mode 100644 index 4c62ed7..0000000 --- a/hubert/hubert_manager.py +++ /dev/null @@ -1,46 +0,0 @@ -import os.path -import shutil -import urllib.request - -import huggingface_hub - - -class HuBERTManager: - @staticmethod - def make_sure_hubert_installed(download_url: str = 'https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt', file_name: str = 'hubert.pt'): - install_dir = os.path.join('data', 'models', 'hubert') - if not os.path.isdir(install_dir): - os.makedirs(install_dir, exist_ok=True) - install_file = os.path.join(install_dir, file_name) - if not os.path.isfile(install_file): - print('Downloading HuBERT base model') - urllib.request.urlretrieve(download_url, install_file) - print('Downloaded HuBERT') - return install_file - - - @staticmethod - def 
diff --git a/hubert/hubert_manager.py b/hubert/hubert_manager.py
deleted file mode 100644
index 4c62ed7..0000000
--- a/hubert/hubert_manager.py
+++ /dev/null
@@ -1,46 +0,0 @@
-import os.path
-import shutil
-import urllib.request
-
-import huggingface_hub
-
-
-class HuBERTManager:
-    @staticmethod
-    def make_sure_hubert_installed(download_url: str = 'https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt', file_name: str = 'hubert.pt'):
-        install_dir = os.path.join('data', 'models', 'hubert')
-        if not os.path.isdir(install_dir):
-            os.makedirs(install_dir, exist_ok=True)
-        install_file = os.path.join(install_dir, file_name)
-        if not os.path.isfile(install_file):
-            print('Downloading HuBERT base model')
-            urllib.request.urlretrieve(download_url, install_file)
-            print('Downloaded HuBERT')
-        return install_file
-
-
-    @staticmethod
-    def make_sure_tokenizer_installed(model: str = 'quantifier_hubert_base_ls960_14.pth', repo: str = 'GitMylo/bark-voice-cloning', local_file: str = 'tokenizer.pth'):
-        install_dir = os.path.join('data', 'models', 'hubert')
-        if not os.path.isdir(install_dir):
-            os.makedirs(install_dir, exist_ok=True)
-        install_file = os.path.join(install_dir, local_file)
-        if not os.path.isfile(install_file):
-            print('Downloading HuBERT custom tokenizer')
-            huggingface_hub.hf_hub_download(repo, model, local_dir=install_dir, local_dir_use_symlinks=False)
-            shutil.move(os.path.join(install_dir, model), install_file)
-            print('Downloaded tokenizer')
-        return install_file
-
-    @staticmethod
-    def make_sure_hubert_rvc_installed(model: str = 'hubert_base.pt', repo: str = 'lj1995/VoiceConversionWebUI', local_file: str = 'hubert_rvc.pt'):
-        install_dir = os.path.join('data', 'models', 'hubert')
-        if not os.path.isdir(install_dir):
-            os.makedirs(install_dir, exist_ok=True)
-        install_file = os.path.join(install_dir, local_file)
-        if not os.path.isfile(install_file):
-            print('Downloading HuBERT for RVC')
-            huggingface_hub.hf_hub_download(repo, model, local_dir=install_dir, local_dir_use_symlinks=False)
-            shutil.move(os.path.join(install_dir, model), install_file)
-            print('Downloaded HuBERT for RVC')
-        return install_file
diff --git a/install.py b/install.py
index 8a30cc4..6ada6b2 100644
--- a/install.py
+++ b/install.py
@@ -12,6 +12,9 @@ def ensure_installed():
     ensure_venv()
     if not args.skip_install:
         install_requirements()
+    if args.download_models:
+        import model_manager
+        model_manager.download_all_models()
 
 
 if __name__ == '__main__':
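
The same pre-fetch that install.py triggers can also be run directly from Python — a sketch, assuming a standard checkout with model_manager.py importable from the repo root:

    # Equivalent of `python install.py --download-models`:
    import model_manager

    model_manager.download_all_models()  # reads default_models.json and fetches every listed model
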
diff --git a/main.py b/main.py
index 2b42c1e..d6a31d2 100644
--- a/main.py
+++ b/main.py
@@ -1,5 +1,6 @@
 from webui.args import args  # Will show help message if needed
 import os
+
 # Set custom default huggingface download path
 if not args.no_data_cache:
     os.environ['XDG_CACHE_HOME'] = os.getenv('XDG_CACHE_HOME', os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data', 'models', 'unclassified'))
@@ -8,7 +9,8 @@
 os.environ['HF_HUB_CACHE'] = os.getenv('HF_HUB_CACHE', os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data', 'models', 'hf_cache'))  # Experimental, due to some people being unable to install from this variable missing, set a default here.
 
 # Set custom gradio temp dir
-os.environ['GRADIO_TEMP_DIR'] = os.getenv('GRADIO_TEMP_DIR', os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data', 'temp'))
+os.environ['GRADIO_TEMP_DIR'] = os.getenv('GRADIO_TEMP_DIR',
+                                          os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data', 'temp'))
 
 from autodebug.prelaunch import prelaunch_checks
 from autodebug import autodebug
@@ -20,22 +22,22 @@
 print('Activating extensions')
 import webui.extensionlib.extensionmanager as em
+
 for e in em.states.values():
     e.activate()
 
 print('Preparing')
 
 from webui.modules.implementations.tts_monkeypatching import patch as patch1
+
 patch1()
 
-# from webui.modules.implementations.gradio_monkeypatching import patch as patch2
-# patch2()
-# from webui.modules.implementations.huggingface_hub_monkeypatching import patch as patch3
-# patch3()
+
 import torch
 
-print('Launching, cuda available:', torch.cuda.is_available())
+print('Launching, cuda available:', torch.cuda.is_available())
 
 from webui.webui import launch_webui
diff --git a/model_manager.py b/model_manager.py
new file mode 100644
index 0000000..709499b
--- /dev/null
+++ b/model_manager.py
@@ -0,0 +1,148 @@
+import json
+import os
+from typing import List, Union
+
+import requests
+from huggingface_hub import hf_hub_download, snapshot_download
+from tqdm import tqdm
+
+
+def download_from_url(url: str, filename: str, local_dir: str) -> str:
+    """
+    Download a file from a URL using TQDM to show the progress.
+    :param url: The URL to download the file from.
+    :param filename: The name of the file to save.
+    :param local_dir: The directory to save the file in.
+    :return: The path to the downloaded file.
+    """
+    if not os.path.exists(local_dir):
+        os.makedirs(local_dir, exist_ok=True)
+    local_path = os.path.join(local_dir, filename)
+
+    if not os.path.isfile(local_path):
+        response = requests.get(url, stream=True)
+
+        total_size_in_bytes = int(response.headers.get('content-length', 0))
+        block_size = 1024  # 1 Kibibyte
+
+        progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)
+        with open(local_path, 'wb') as file:
+            for data in response.iter_content(block_size):
+                progress_bar.update(len(data))
+                file.write(data)
+        progress_bar.close()
+
+        if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
+            print("ERROR, something went wrong")
+
+    return local_path
+
+
+def export_model_record(model_url: str, model_name: str, model_type: str, single_file: bool, single_file_name: str,
+                        save_file_name: str, allow_patterns: Union[str, List[str]], ignore_patterns: Union[str, List[str]]):
+    calls_file_path = os.path.join(os.path.dirname(__file__), 'default_models.json')
+    try:
+        with open(calls_file_path, 'r') as calls_file:
+            model_calls = json.load(calls_file)
+    except Exception:
+        model_calls = {}
+
+    # Check if the model_url is already logged, if not, log the call parameters
+    model_key = model_url
+    if single_file and single_file_name:
+        model_key = f"{model_key}||{single_file_name}"
+    if model_key not in model_calls:
+        model_calls[model_key] = {
+            'model_name': model_name,
+            'model_type': model_type,
+            'single_file': single_file,
+            'single_file_name': single_file_name,
+            'save_file_name': save_file_name,
+            'allow_patterns': allow_patterns,
+            'ignore_patterns': ignore_patterns
+        }
+        with open(calls_file_path, 'w') as calls_file:
+            json.dump(model_calls, calls_file, indent=4)
+
+
+def get_model_path(
+        model_url: str,
+        model_name: str = None,
+        model_type: str = None,
+        single_file: bool = False,
+        single_file_name: str = None,
+        save_file_name: str = None,
+        allow_patterns: Union[str, List[str]] = None,
+        ignore_patterns: Union[str, List[str]] = None) -> str:
+    """
+    Get the model path from the model URL
+    :param model_url: The HF Hub repo id of the model, or a direct download URL
+    :param model_name: The directory to store the model in the models folder, defaults to the model/creator name
+    :param model_type: The type of model to download - this will be inserted into the model path
+    :param single_file: Whether the model is a single file
+    :param single_file_name: The name of the single file to download
+    :param save_file_name: The name to save the file as on disk (renames single_file_name)
+    :param allow_patterns: The patterns to allow for file downloads
+    :param ignore_patterns: The patterns to ignore for file downloads
+    :return: The model path
+    """
+    # This is for development purposes only, uncomment to log the model calls
+    # export_model_record(model_url, model_name, model_type, single_file, single_file_name, save_file_name,
+    #                     allow_patterns, ignore_patterns)
+
+    if model_url.startswith("http"):
+        model_dir = os.path.join(os.path.dirname(__file__), 'data', 'models')
+        if model_type is not None:
+            model_dir = os.path.join(model_dir, model_type)
+        if model_name is not None:
+            model_dir = os.path.join(model_dir, model_name)
+    else:
+        model_dev = model_url.split('/')[0]
+        model_name = model_name or model_url.split('/')[1]
+        if model_type is not None:
+            model_dir = os.path.join(os.path.dirname(__file__), 'data', 'models', model_type, model_dev, model_name)
+        else:
+            model_dir = os.path.join(os.path.dirname(__file__), 'data', 'models', model_dev, model_name)
+    if single_file and single_file_name:
+        model_path = os.path.join(model_dir, single_file_name)
+        if save_file_name:
+            model_path = os.path.join(model_dir, save_file_name)
+        do_download = not os.path.isfile(model_path)
+    else:
+        do_download = not os.path.exists(model_dir)
+        model_path = model_dir
+
+    # If the model doesn't exist, use the HF Hub to download it
+    if do_download:
+        try:
+            if single_file and single_file_name:
+                print(f"Downloading {single_file_name} from {model_url}")
+                if model_url.startswith("http"):
+                    dl_path = download_from_url(model_url, filename=single_file_name, local_dir=model_path)
+                else:
+                    dl_path = hf_hub_download(model_url, filename=single_file_name, local_dir=model_path,
+                                              local_dir_use_symlinks=False)
+                if dl_path != model_path:
+                    temp_name = os.path.join(model_dir, f"{single_file_name}.tmp")
+                    os.rename(dl_path, temp_name)
+                    # If the dirname of dl_path is empty, remove it
+                    if not os.listdir(os.path.dirname(dl_path)):
+                        os.rmdir(os.path.dirname(dl_path))
+                    os.rename(temp_name, model_path)
+            else:
+                print(f"Downloading model from {model_url}")
+                snapshot_download(model_url, local_dir=model_path, local_dir_use_symlinks=False,
+                                  allow_patterns=allow_patterns, ignore_patterns=ignore_patterns)
+        except Exception as e:
+            raise Exception(f"Failed to download model from {model_url}: {e}")
+
+    return model_path
+
+
+def download_all_models():
+    calls_file_path = os.path.join(os.path.dirname(__file__), 'default_models.json')
+    with open(calls_file_path, 'r') as calls_file:
+        model_calls = json.load(calls_file)
+
+    for model_key, model_params in model_calls.items():
+        # Keys are either a plain repo id/URL, or "<repo_or_url>||<single_file_name>"
+        model_url = model_key.split('||')[0]
+        get_model_path(model_url, **model_params)
\ No newline at end of file
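
Taken together, the helpers above resolve everything under data/models/: direct URLs go to data/models/[model_type]/, Hub repos to data/models/[model_type]/[creator]/[repo]/, and save_file_name renames a pinned single file on disk. A sketch of the call shapes used throughout this PR (illustration only; paths derived from the logic above):

    import os
    import model_manager
    from model_manager import download_from_url

    # 1. Hub repo, full snapshot -> data/models/music-generation/facebook/musicgen-small
    model_manager.get_model_path('facebook/musicgen-small', model_type='music-generation')

    # 2. Hub repo, single pinned file, renamed on disk
    #    -> data/models/hubert/lj1995/VoiceConversionWebUI/hubert_rvc.pt
    model_manager.get_model_path('lj1995/VoiceConversionWebUI', model_type='hubert', single_file=True,
                                 single_file_name='hubert_base.pt', save_file_name='hubert_rvc.pt')

    # 3. Direct URL, single file -> data/models/whisper/base.pt
    model_manager.get_model_path(
        'https://openaipublic.azureedge.net/main/whisper/models/ed3a0b6b1c0edf879ad9b11b1af5a0e6ab5db9205f891f668f8b0e6c6326e34e/base.pt',
        model_type='whisper', single_file=True, single_file_name='base.pt')

    # 4. The raw downloader can also be used on its own; it streams in 1 KiB chunks
    #    with a tqdm progress bar and skips the request if the file already exists.
    download_from_url('https://dl.fbaipublicfiles.com/encodec/v0/encodec_24khz-d7cc33bc.th',
                      filename='encodec_24khz-d7cc33bc.th',
                      local_dir=os.path.join('data', 'models', 'encodec'))
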
diff --git a/webui/args.py b/webui/args.py
index ce1c34b..69f26fd 100644
--- a/webui/args.py
+++ b/webui/args.py
@@ -8,7 +8,7 @@
 parser.add_argument('-sv', '--skip-venv', action='store_true', help='Skip creating/activating venv, also skips install (for advanced users)')
 parser.add_argument('--no-data-cache', action='store_true', help='Don\'t override the default huggingface_hub cache path.')
 parser.add_argument('-v', '--verbose', action='store_true', help='Show more info, like logs during installs')
-
+parser.add_argument('--download-models', action='store_true', help='Pre-download all models from the hub')
 # Gradio
 parser.add_argument('-s', '--share', action='store_true', help='Share this gradio instance.')
 parser.add_argument('-u', '--username', '--user', type=str, help='Gradio username')
diff --git a/webui/modules/download.py b/webui/modules/download.py
index c85fc65..b66d9c4 100644
--- a/webui/modules/download.py
+++ b/webui/modules/download.py
@@ -4,7 +4,7 @@
 import huggingface_hub
 
 import webui.modules.models as mod
 
-model_types = ['text-to-speech', 'automatic-speech-recognition', 'audio-to-audio', 'rvc']
+model_types = ['text-to-speech', 'audio-to-audio', 'rvc']
 
 
 class AutoModel:
@@ -19,6 +19,7 @@ def __str__(self):
 def get_rvc_models():
     path = os.path.join('data', 'models', 'rvc')
     output = []
+    os.makedirs(path, exist_ok=True)
     for f in os.listdir(path):
         f_path = os.path.join(path, f)
         if os.path.isdir(f_path):
@@ -42,13 +43,4 @@ def fill_models(model_type: str):
 
 
 def get_file_name(repo_id: str):
-    return repo_id.replace('/', '--')
-
-
-def hub_download(repo_id: str, model_type: str):
-    try:
-        huggingface_hub.snapshot_download(repo_id, local_dir_use_symlinks=False,
-                                          local_dir=f'data/models/{model_type}/{get_file_name(repo_id)}')
-    except Exception as e:
-        return [f'{str(e)}', gradio.Dropdown.update()]
-    return [f"Successfully downloaded {repo_id}", mod.refresh_choices()]
+    return repo_id.replace('/', '--')
\ No newline at end of file
diff --git a/webui/modules/implementations/audiocraft.py b/webui/modules/implementations/audiocraft.py
index 1ab0efc..0f6f751 100644
--- a/webui/modules/implementations/audiocraft.py
+++ b/webui/modules/implementations/audiocraft.py
@@ -5,6 +5,8 @@
 from audiocraft.models import MusicGen
 from audiocraft.models import AudioGen
 
+import model_manager
+
 model: MusicGen = None
 loaded = False
 used_model = ''
@@ -24,7 +26,8 @@ def create_model(pretrained='medium', map_device='cuda' if torch.cuda.is_availab
     delete_model()
     global model, loaded, device, used_model
     try:
-        model = MusicGen.get_pretrained(pretrained, device=map_device) if pretrained not in audiogen_models else AudioGen.get_pretrained(pretrained, device=map_device)
+        model_path = model_manager.get_model_path(pretrained, model_type="music-generation")
+        model = MusicGen.get_pretrained(model_path, device=map_device) if pretrained not in audiogen_models else AudioGen.get_pretrained(pretrained, device=map_device)
         device = map_device
         used_model = pretrained
         loaded = True
diff --git a/webui/modules/implementations/audioldm.py b/webui/modules/implementations/audioldm.py
index 157cd04..60e7c12 100644
--- a/webui/modules/implementations/audioldm.py
+++ b/webui/modules/implementations/audioldm.py
@@ -6,6 +6,8 @@
 import transformers
 import librosa
 
+import model_manager
+
 model: diffusers.AudioLDMPipeline = None
 loaded = False
 clap_model: transformers.ClapModel = None
@@ -20,10 +22,11 @@ def create_model(pretrained='cvssp/audioldm-m-full', map_device='cuda' if torch.
     delete_model()
     global model, loaded, clap_model, processor, device
     try:
-        cache_dir = os.path.join('data', 'models', 'audioldm')
-        model = diffusers.AudioLDMPipeline.from_pretrained(pretrained, cache_dir=cache_dir).to(map_device)
-        clap_model = transformers.ClapModel.from_pretrained("sanchit-gandhi/clap-htsat-unfused-m-full", cache_dir=cache_dir).to(map_device)
-        processor = transformers.AutoProcessor.from_pretrained("sanchit-gandhi/clap-htsat-unfused-m-full", cache_dir=cache_dir)
+        model_path = model_manager.get_model_path(pretrained, model_type="music-generation", allow_patterns=['*.safetensors', '*.json', '*.txt'])
+        model = diffusers.AudioLDMPipeline.from_pretrained(model_path).to(map_device)
+        clap_model_path = model_manager.get_model_path('sanchit-gandhi/clap-htsat-unfused-m-full', model_type="music-generation")
+        clap_model = transformers.ClapModel.from_pretrained(clap_model_path).to(map_device)
+        processor = transformers.AutoProcessor.from_pretrained(clap_model_path)
         device = map_device
         loaded = True
     except:
diff --git a/webui/modules/implementations/audioldm2.py b/webui/modules/implementations/audioldm2.py
index 155d5d2..2481c3d 100644
--- a/webui/modules/implementations/audioldm2.py
+++ b/webui/modules/implementations/audioldm2.py
@@ -6,6 +6,8 @@
 import transformers
 import librosa
 
+import model_manager
+
 model: diffusers.AudioLDM2Pipeline = None
 loaded = False
 device: str = None
@@ -18,8 +20,8 @@ def create_model(pretrained='cvssp/audioldm2', map_device='cuda' if torch.cuda.i
     delete_model()
     global model, loaded, device
     try:
-        cache_dir = os.path.join('data', 'models', 'audioldm')
-        model = diffusers.AudioLDM2Pipeline.from_pretrained(pretrained, cache_dir=cache_dir).to(map_device)
+        model_path = model_manager.get_model_path(pretrained, model_type="music-generation")
+        model = diffusers.AudioLDM2Pipeline.from_pretrained(model_path).to(map_device)
         device = map_device
         loaded = True
     except:
diff --git a/webui/modules/implementations/gradio_monkeypatching.py b/webui/modules/implementations/gradio_monkeypatching.py
deleted file mode 100644
index e0aba2b..0000000
--- a/webui/modules/implementations/gradio_monkeypatching.py
+++ /dev/null
@@ -1,39 +0,0 @@
-from typing import Literal, Callable
-
-import gradio
-import numpy as np
-
-
-class Audio(gradio.Audio):
-    def __init__(
-            self,
-            value: str | tuple[int, np.ndarray] | Callable | None = None,
-            *,
-            source: str = "upload",
-            type: str = "numpy",
-            label: str | None = None,
-            every: float | None = None,
-            show_label: bool = True,
-            container: bool = True,
-            scale: int | None = None,
-            min_width: int = 160,
-            interactive: bool | None = None,
-            visible: bool = True,
-            streaming: bool = False,
-            elem_id: str | None = None,
-            elem_classes: list[str] | str | None = None,
-            format: Literal["wav", "mp3"] = "wav",
-            autoplay: bool = False,
-            **kwargs,
-    ):
-        super().__init__(value, source=source, type=type, label=label, every=every, show_label=show_label,
-                         container=container, scale=scale, min_width=min_width, interactive=interactive,
-                         visible=visible, streaming=streaming, elem_id=elem_id, elem_classes=elem_classes,
-                         format=format, autoplay=autoplay, **kwargs)
-        self.change(fn=lambda a: a, inputs=self, outputs=self)
-
-
-def patch():
-    print('Monkeypatching gradio')
-    gradio.Audio = Audio
-
diff --git a/webui/modules/implementations/patches/bark_custom_voices.py b/webui/modules/implementations/patches/bark_custom_voices.py
index 1adc2d6..55cda4b 100644
--- a/webui/modules/implementations/patches/bark_custom_voices.py
+++ b/webui/modules/implementations/patches/bark_custom_voices.py
@@ -1,16 +1,18 @@
 import torch
 import torchaudio
-from bark.generation import SAMPLE_RATE, load_codec_model
+from bark.generation import SAMPLE_RATE
+from encodec.utils import convert_audio
 
+import model_manager
 from hubert.customtokenizer import CustomTokenizer
-from hubert.hubert_manager import HuBERTManager
 from hubert.pre_kmeans_hubert import CustomHubert
-from webui.modules.implementations.patches.bark_generation import generate_text_semantic_new, generate_coarse_new, generate_fine_new
-from encodec.utils import convert_audio
+from webui.modules.implementations.patches.bark_generation import generate_text_semantic_new, generate_coarse_new, \
+    generate_fine_new, load_codec_model
 from webui.ui.tabs import settings
 
 
-def generate_semantic_fine(transcript='There actually isn\'t a way to do that. It\'s impossible. Please don\'t even bother.'):
+def generate_semantic_fine(
+        transcript='There actually isn\'t a way to do that. It\'s impossible. Please don\'t even bother.'):
     """
     Creates a speech file with semantics and fine audio
     :param transcript: The transcript.
@@ -27,13 +29,18 @@ def generate_semantic_fine(transcript='There actually isn\'t a way to do that. I
 def load_hubert(clone_model):
     global huberts
-    hubert_path = HuBERTManager.make_sure_hubert_installed()
-    # model = ('quantifier_V1_hubert_base_ls960_23.pth', 'tokenizer_large.pth') if args.bark_cloning_large_model else ('quantifier_hubert_base_ls960_14.pth', 'tokenizer.pth')
-    tokenizer_path = HuBERTManager.make_sure_tokenizer_installed(model=clone_model['file'], local_file=clone_model['dlfilename'], repo=clone_model['repo'])
+    hubert_path = model_manager.get_model_path(model_url='https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt',
+                                               model_type='hubert', single_file=True,
+                                               single_file_name='hubert_base_ls960.pt', save_file_name='hubert.pt')
+
+    tokenizer_path = model_manager.get_model_path(model_url=clone_model['repo'], model_type='hubert', single_file=True,
+                                                  single_file_name=clone_model['file'],
+                                                  save_file_name=clone_model['dlfilename'])
     if 'hubert' not in huberts:
         print('Loading HuBERT')
         huberts['hubert'] = CustomHubert(hubert_path)
-    if 'tokenizer' not in huberts or ('tokenizer_name' in huberts and huberts['tokenizer_name'] != clone_model['name'].casefold()):
+    if 'tokenizer' not in huberts or (
+            'tokenizer_name' in huberts and huberts['tokenizer_name'] != clone_model['name'].casefold()):
         print('Loading Custom Tokenizer')
         tokenizer = CustomTokenizer.load_from_checkpoint(tokenizer_path, map_location=torch.device('cpu'))
         huberts['tokenizer'] = tokenizer
@@ -78,7 +85,8 @@ def generate_course_history(fine_history):
 
 
 def generate_fine_from_wav(file):
-    model = load_codec_model(use_gpu=not settings.get('bark_use_cpu'))  # Don't worry about reimporting, it stores the loaded model in a dict
+    model = load_codec_model(
+        use_gpu=not settings.get('bark_use_cpu'))  # Don't worry about reimporting, it stores the loaded model in a dict
     wav, sr = torchaudio.load(file)
     wav = convert_audio(wav, sr, SAMPLE_RATE, model.channels)
     wav = wav.unsqueeze(0)
@@ -91,4 +99,3 @@
     codes = codes.cpu().numpy()
 
     return codes
-
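
Note that hubert.pt still resolves to data/models/hubert/hubert.pt, the same location the deleted HuBERTManager used, while the tokenizer now lands under data/models/hubert/[creator]/[repo]/. A rough sketch of how the two checkpoints are consumed; the feature/token calls are assumed from the repo's hubert package and are not shown in this diff:

    import torch
    from hubert.customtokenizer import CustomTokenizer
    from hubert.pre_kmeans_hubert import CustomHubert

    hubert = CustomHubert(hubert_path)
    tokenizer = CustomTokenizer.load_from_checkpoint(tokenizer_path, map_location=torch.device('cpu'))
    # Assumed API: features = hubert.forward(wav, input_sample_hz=sr)
    #              semantic_tokens = tokenizer.get_token(features)
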
diff --git a/webui/modules/implementations/patches/bark_generation.py b/webui/modules/implementations/patches/bark_generation.py
index a52ff62..f0f6740 100644
--- a/webui/modules/implementations/patches/bark_generation.py
+++ b/webui/modules/implementations/patches/bark_generation.py
@@ -1,9 +1,11 @@
+from pathlib import Path
 from typing import Union
 
 import bark.generation as o
 import gradio
 from bark.generation import *
 
+import model_manager
 from webui.ui.tabs import settings
 
 SUPPORTED_LANGS = [
@@ -558,9 +560,7 @@ def _load_model(ckpt_path, device, use_small=False, model_type="text"):
         raise NotImplementedError()
     model_key = f"{model_type}_small" if use_small or USE_SMALL_MODELS else model_type
     model_info = REMOTE_MODEL_PATHS[model_key]
-    if not os.path.exists(ckpt_path):
-        logger.info(f"{model_type} model not found, downloading into `{CACHE_DIR}`.")
-        o._download(model_info["repo_id"], model_info["file_name"])
+    ckpt_path = model_manager.get_model_path(model_info["repo_id"], model_type="text-to-speech", single_file=True, single_file_name=model_info["file_name"])
     checkpoint = torch.load(ckpt_path, map_location=device)
     # this is a hack
     model_args = checkpoint["model_args"]
@@ -594,8 +594,9 @@ def _load_model(ckpt_path, device, use_small=False, model_type="text"):
     model.to(device)
     del checkpoint, state_dict
     o._clear_cuda_cache()
+    bert_path = model_manager.get_model_path("google-bert/bert-base-multilingual-cased", allow_patterns=['*.safetensors', '*.json', '*.txt'])
     if model_type == "text":
-        tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
+        tokenizer = BertTokenizer.from_pretrained(bert_path)
         return {
             "model": model,
             "tokenizer": tokenizer,
@@ -631,6 +632,19 @@ def load_model(use_gpu=True, use_small=False, force_reload=False, model_type="te
     return models[model_key]
 
 
+def encodec_load_codec_model(device):
+    model_base_url = "https://dl.fbaipublicfiles.com/encodec/v0/"
+    checkpoint_name = 'encodec_24khz-d7cc33bc.th'
+    model_file = model_manager.get_model_path(f"{model_base_url}{checkpoint_name}", model_type="encodec", single_file=True, single_file_name=checkpoint_name)
+    model_path_obj = Path(os.path.dirname(model_file))
+    model = EncodecModel.encodec_model_24khz(repository=model_path_obj)
+    model.set_target_bandwidth(6.0)
+    model.eval()
+    model.to(device)
+    o._clear_cuda_cache()
+    return model
+
+
 def load_codec_model(use_gpu=True, force_reload=False):
     global models
     global models_devices
@@ -644,7 +658,7 @@ def load_codec_model(use_gpu=True, force_reload=False):
         device = "cpu"
     if model_key not in models or force_reload:
         clean_models(model_key=model_key)
-        model = o._load_codec_model(device)
+        model = encodec_load_codec_model(device)
         models[model_key] = model
     models[model_key].to(device)
     return models[model_key]
diff --git a/webui/modules/implementations/rvc/custom_pitch_extraction.py b/webui/modules/implementations/rvc/custom_pitch_extraction.py
index d8e1a72..19f5122 100644
--- a/webui/modules/implementations/rvc/custom_pitch_extraction.py
+++ b/webui/modules/implementations/rvc/custom_pitch_extraction.py
@@ -9,6 +9,8 @@
 from scipy import signal
 from torch import Tensor
 
+import model_manager
+
 
 def get_f0_crepe_computation(
         x,
@@ -149,8 +151,7 @@ def pitch_extract(f0_method, x, f0_min, f0_max, p_len, time_step, sr, window, cr
         rmvpe_model_path = os.path.join('data', 'models', 'rmvpe')
         rmvpe_model_file = os.path.join(rmvpe_model_path, 'rmvpe.pt')
         if not os.path.isfile(rmvpe_model_file):
-            import huggingface_hub
-            rmvpe_model_file = huggingface_hub.hf_hub_download('lj1995/VoiceConversionWebUI', 'rmvpe.pt', local_dir=rmvpe_model_path, local_dir_use_symlinks=False)
+            rmvpe_model_file = model_manager.get_model_path('lj1995/VoiceConversionWebUI', single_file=True, single_file_name='rmvpe.pt')
 
         from webui.modules.implementations.rvc.rmvpe import RMVPE
         print("loading rmvpe model")
diff --git a/webui/modules/implementations/rvc/rvc.py b/webui/modules/implementations/rvc/rvc.py
index bdd0286..3b59116 100644
--- a/webui/modules/implementations/rvc/rvc.py
+++ b/webui/modules/implementations/rvc/rvc.py
@@ -7,24 +7,22 @@
 import gc
 import os
 import traceback
+from multiprocessing import cpu_count
 
 import ffmpeg
 import numpy as np
-import torch.cuda
-import argparse
 import torch
-from multiprocessing import cpu_count
+import torch.cuda
 from fairseq import checkpoint_utils
 
-from hubert.hubert_manager import HuBERTManager
-from webui.modules.implementations.rvc.vc_infer_pipeline import VC
-
+from model_manager import get_model_path
 from webui.modules.implementations.rvc.infer_pack.models import (
     SynthesizerTrnMs256NSFsid,
     SynthesizerTrnMs256NSFsid_nono,
     SynthesizerTrnMs768NSFsid,
     SynthesizerTrnMs768NSFsid_nono,
 )
+from webui.modules.implementations.rvc.vc_infer_pipeline import VC
 
 hubert_model = None
 weight_root = os.path.join('data', 'models', 'rvc')
@@ -124,8 +122,10 @@ def device_config(self) -> tuple:
 def load_hubert():
     global hubert_model
     if not hubert_model:
+        hubert_path = get_model_path(model_url='lj1995/VoiceConversionWebUI', model_type='hubert', single_file=True,
single_file_name="hubert_base.pt", save_file_name='hubert_rvc.pt') models, _, _ = checkpoint_utils.load_model_ensemble_and_task( - [HuBERTManager.make_sure_hubert_rvc_installed()], + [hubert_path], suffix="", ) hubert_model = models[0] diff --git a/webui/modules/implementations/whisper.py b/webui/modules/implementations/whisper.py index 8d55a8f..248d257 100644 --- a/webui/modules/implementations/whisper.py +++ b/webui/modules/implementations/whisper.py @@ -1,43 +1,24 @@ import gc import os.path from tempfile import NamedTemporaryFile +from typing import Union import torch import whisper from transformers import WhisperProcessor, WhisperForConditionalGeneration, AutomaticSpeechRecognitionPipeline from gradio_client.client import DEFAULT_TEMP_DIR +import model_manager + processor: WhisperProcessor = None -model: WhisperForConditionalGeneration | AutomaticSpeechRecognitionPipeline = None +model: Union[WhisperForConditionalGeneration, AutomaticSpeechRecognitionPipeline] = None device: str = None loaded_model: str = None def get_official_models(): - # return [ - # 'openai/whisper-tiny.en', - # 'openai/whisper-small.en', - # 'openai/whisper-base.en', - # 'openai/whisper-medium.en', - # 'openai/whisper-tiny', - # 'openai/whisper-small', - # 'openai/whisper-base', - # 'openai/whisper-medium', - # 'openai/whisper-large', - # 'openai/whisper-large-v2' - # ] - return [ - 'tiny.en', - 'small.en', - 'base.en', - 'medium.en', - 'tiny', - 'small', - 'base', - 'medium', - 'large', - 'large-v2' - ] + models = whisper._MODELS + return models.keys() def unload(): @@ -56,10 +37,18 @@ def load(pretrained_model='openai/whisper-base', map_device='cuda' if torch.cuda try: if loaded_model != pretrained_model: unload() - # model = pipeline('automatic-speech-recognition', pretrained_model, device=map_device, model_kwargs={'cache_dir': 'models/automatic-speech-recognition'}) - model = whisper.load_model(pretrained_model, map_device, 'data/models/automatic-speech-recognition/whisper') - loaded_model = pretrained_model - device = map_device + print(f'Loading {pretrained_model}') + whisper_models = whisper._MODELS + official_models = get_official_models() + if pretrained_model in official_models: + model_url = whisper_models[pretrained_model] + model_name = os.path.basename(model_url) + model_path = model_manager.get_model_path(model_url, model_type="whisper", single_file=True, single_file_name=model_name) + model = whisper.load_model(model_path, map_device) + loaded_model = pretrained_model + device = map_device + else: + raise Exception(f'Model {pretrained_model} not found; available models = {get_official_models()}') return f'Loaded {pretrained_model}' except Exception as e: unload() diff --git a/webui/ui/tabs/rvc.py b/webui/ui/tabs/rvc.py index 87fcee5..f6fe404 100644 --- a/webui/ui/tabs/rvc.py +++ b/webui/ui/tabs/rvc.py @@ -3,6 +3,8 @@ import torch.cuda import torchaudio import gradio + +import model_manager from webui.modules import util from webui.modules.download import fill_models @@ -60,6 +62,10 @@ def load_rvc(model): if not model: return unload_rvc() import webui.modules.implementations.rvc.rvc as rvc + if not os.path.exists(model): + model_path = model_manager.get_model_path(model, model_type="rvc", single_file=True, single_file_name="model.pth") + if not os.path.exists(model_path): + return [gradio.update(), gradio.update(maximum=0, value=0, visible=False)] maximum = rvc.load_rvc(model) return [gradio.update(), gradio.update(maximum=maximum, value=0, visible=maximum > 0)] diff --git 
diff --git a/webui/ui/tabs/rvc.py b/webui/ui/tabs/rvc.py
index 87fcee5..f6fe404 100644
--- a/webui/ui/tabs/rvc.py
+++ b/webui/ui/tabs/rvc.py
@@ -3,6 +3,8 @@
 import torch.cuda
 import torchaudio
 import gradio
+
+import model_manager
 from webui.modules import util
 from webui.modules.download import fill_models
@@ -60,6 +62,10 @@ def load_rvc(model):
     if not model:
         return unload_rvc()
     import webui.modules.implementations.rvc.rvc as rvc
+    if not os.path.exists(model):
+        model_path = model_manager.get_model_path(model, model_type="rvc", single_file=True, single_file_name="model.pth")
+        if not os.path.exists(model_path):
+            return [gradio.update(), gradio.update(maximum=0, value=0, visible=False)]
     maximum = rvc.load_rvc(model)
     return [gradio.update(), gradio.update(maximum=maximum, value=0, visible=maximum > 0)]
diff --git a/webui/ui/tabs/training/training/rvc_workspace.py b/webui/ui/tabs/training/training/rvc_workspace.py
index 80b3e45..98cf05c 100644
--- a/webui/ui/tabs/training/training/rvc_workspace.py
+++ b/webui/ui/tabs/training/training/rvc_workspace.py
@@ -22,7 +22,7 @@ from scipy.io import wavfile
 from torch.utils.data import DataLoader
 
-from hubert import hubert_manager
+from model_manager import get_model_path
 from webui.modules.implementations.rvc import utils
 from webui.modules.implementations.rvc.data_utils import TextAudioLoaderMultiNSFsid, TextAudioLoader, \
     DistributedBucketSampler, TextAudioCollateMultiNSFsid, TextAudioCollate, spec_to_mel_torch, mel_spectrogram_torch
@@ -249,9 +249,10 @@ def pitch_extract():
     output += '\nLoading HuBERT model...'
     yield output
-
+    hubert_path = get_model_path(model_url='lj1995/VoiceConversionWebUI', model_type='hubert', single_file=True,
+                                 single_file_name="hubert_base.pt", save_file_name='hubert_rvc.pt')
     models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
-        [hubert_manager.HuBERTManager.make_sure_hubert_rvc_installed()],
+        [hubert_path],
         suffix="",
     )
diff --git a/webui/webui.py b/webui/webui.py
index 9faebd7..1561c05 100644
--- a/webui/webui.py
+++ b/webui/webui.py
@@ -6,6 +6,8 @@ def launch_webui():
     auth = (args.username, args.password) if args.username else None
+    download_models = args.download_models
+
     template_response_original = gradio.routes.templates.TemplateResponse
 
     # Magic monkeypatch