import json
from pathlib import Path

from safetensors import safe_open

from models.clip.embeddings import ManagedCLIPTextEmbeddings
from models.clip.tokenizer import MultiCLIPTokenizer


def load_config(filename):
    """Load a JSON config file, recursively merging in any "base" config.

    Keys in the child's "args" override those inherited from the base,
    since the dict-union operator keeps the right-hand operand's values.
    """
    with open(filename, 'rt') as f:
        config = json.load(f)

    args = config["args"]

    if "base" in config:
        args = load_config(Path(filename).parent / config["base"]) | args

    return args


def load_embeddings_from_dir(tokenizer: MultiCLIPTokenizer, embeddings: ManagedCLIPTextEmbeddings, embeddings_dir: Path):
    """Load every safetensors embedding file found in `embeddings_dir`.

    Each file's stem is registered as a (multi-vector) placeholder token,
    with the number of vectors taken from the stored tensor's first
    dimension. Returns the token strings and their new token IDs.
    """
    if not embeddings_dir.exists() or not embeddings_dir.is_dir():
        return [], []  # match the (tokens, ids) shape of the normal return

    filenames = [filename for filename in embeddings_dir.iterdir() if filename.is_file()]
    tokens = [filename.stem for filename in filenames]

    new_ids: list[list[int]] = []
    new_embeds = []

    for filename in filenames:
        with safe_open(filename, framework="pt", device="cpu") as file:
            embed = file.get_tensor("embed")

            added = tokenizer.add_multi_tokens(filename.stem, embed.shape[0])
            new_ids.append(added)
            new_embeds.append(embed)

    # Grow the embedding matrix once, after all tokens are registered,
    # then copy each loaded tensor into its slot(s).
    embeddings.resize(len(tokenizer))

    for (new_id, embeds) in zip(new_ids, new_embeds):
        embeddings.add_embed(new_id, embeds)

    return tokens, new_ids
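

# A minimal, self-contained sketch (an addition, not part of the original
# module) showing load_config's "base" inheritance: the child's "args"
# override the base's because dict-union keeps right-hand values. The file
# names used here are hypothetical.
if __name__ == "__main__":
    import tempfile

    with tempfile.TemporaryDirectory() as tmp:
        base = Path(tmp) / "base.json"
        child = Path(tmp) / "child.json"
        base.write_text(json.dumps({"args": {"lr": 1e-4, "steps": 1000}}))
        child.write_text(json.dumps({"base": "base.json", "args": {"lr": 5e-5}}))

        # Child overrides lr, inherits steps from the base config.
        print(load_config(child))  # -> {'lr': 5e-05, 'steps': 1000}

    # load_embeddings_from_dir needs a tokenizer/embeddings pair constructed
    # elsewhere in the repo; a hypothetical call would look like:
    #     tokens, ids = load_embeddings_from_dir(tokenizer, embeddings, Path("embeddings"))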