path: root/util/files.py
blob: 73ff802a4b23e2ca6640bcd78b4b4adac0277e98
from pathlib import Path
import json

from models.clip.embeddings import ManagedCLIPTextEmbeddings
from models.clip.tokenizer import MultiCLIPTokenizer

from safetensors import safe_open


def load_config(filename):
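    """Load a JSON config file and return its "args" section.

    If the config names a "base" file (resolved relative to this file's
    directory), the base is loaded recursively and merged first, so keys
    in the current file override keys inherited from the base.
    """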
    with open(filename, "rt") as f:
        config = json.load(f)

    args = config["args"]

    if "base" in config:
        args = load_config(Path(filename).parent / config["base"]) | args

    return args


def load_embeddings_from_dir(
    tokenizer: MultiCLIPTokenizer,
    embeddings: ManagedCLIPTextEmbeddings,
    embeddings_dir: Path,
):
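    """Load token embeddings from a directory and register them.

    Every file in the directory is assumed to be a safetensors file whose
    stem is the token name and whose vectors are stored under the "embed"
    key. Returns the token names and, per token, the list of ids the
    tokenizer assigned; returns ([], []) if the directory does not exist.
    """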
    if not embeddings_dir.exists() or not embeddings_dir.is_dir():
        return [], []

    filenames = [
        filename for filename in embeddings_dir.iterdir() if filename.is_file()
    ]
    tokens = [filename.stem for filename in filenames]

    new_ids: list[list[int]] = []
    new_embeds = []

    for filename in filenames:
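        # Each safetensors file stores its embedding vectors under the "embed" key.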
        with safe_open(filename, framework="pt", device="cpu") as file:
            embed = file.get_tensor("embed")

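        # Register the token under the file's stem, requesting one sub-token
        # id per embedding vector (embed.shape[0] rows = multi-vector token).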
        added = tokenizer.add_multi_tokens(filename.stem, embed.shape[0])
        new_ids.append(added)
        new_embeds.append(embed)

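    # Grow the embedding matrix once, after all new tokens are registered.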
    embeddings.resize(len(tokenizer))

    for new_id, embeds in zip(new_ids, new_embeds):
        embeddings.add_embed(new_id, embeds)

    return tokens, new_ids
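

# Usage sketch (hypothetical variable names; the tokenizer and embeddings
# objects come from this codebase's MultiCLIPTokenizer and
# ManagedCLIPTextEmbeddings):
#
#     tokens, new_ids = load_embeddings_from_dir(
#         tokenizer, embeddings, Path("embeddings")
#     )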