# Credits to https://gist.github.com/nnuudev/56ed3242023c8582a32e3130ef59730b / https://boards.4chan.org/trash/thread/51463059#p51472156
#
# Renames e621 images (files named by their MD5 hash) to a tag-based filename,
# looking each MD5 up via the e621 API, for use as training captions.
# Lookups are cached in a JSON file so re-runs avoid re-hitting the API.

import os
import re
import json
import time
import shutil
import urllib.request

config = {
    'input': './in/',        # files to process
    'output': './out/',      # files to copy files to
    'tags': './tags.csv',    # csv of tags associated with the yiffy model (replace for other flavor of booru's taglist associated with the model you're training against)
    'cache': './cache.json', # JSON file of cached tags, will speed up processing if re-running
    'rateLimit': 500,        # time to wait between requests, in milliseconds, e621 imposes a rate limit of 2 requests per second
    'filenameLimit': 192,    # maximum characters to put in the filename, necessary to abide by filesystem limitations, and to "limit" token count for the prompt parser

    'filter': True,
    # fill it with tags of whatever you don't want to make it into the filename
    # for starters, you can also add "anthro", "male", "female", as they're very common tags
    'filters': [
        "anthro",
        "fur",
        "male",
        "female",
        "genitals",
        "video games",
        "animal genitalia",
    ],
    'filtersRegex': [
        r"clothing$",
    ],

    # if True, only include the artist's tag if in the model's taglist, if false, add all artists
    # i've noticed some artists that weren't included in the taglist, but is available in LAION's (vanilla SD)
    'onlyIncludeModelArtists': True,

    # inverts sorting, prioritizing tags with little representation in the model
    'reverseTags': False,
}

# Load the model's taglist: one "tag,count" pair per line -> {tag: count}.
with open(config['tags'], 'rb') as f:
    csv = f.read().decode('utf-8').split("\n")
config['tags'] = {}
for line in csv:
    line = line.strip()
    if not line:
        # skip blank/trailing lines instead of crashing on tuple unpack
        # (a file ending in "\n" yields an empty final element)
        continue
    k, v = line.split(',', 1)  # maxsplit=1: only the first comma separates tag from count
    config['tags'][k] = int(v)

# Load the MD5 -> post-metadata cache. A missing or corrupt cache file just
# means every file gets looked up again; anything else should propagate.
cache = {}
try:
    with open(config['cache'], 'rb') as f:
        cache = json.loads(f.read().decode('utf-8'))
except (FileNotFoundError, json.JSONDecodeError):
    pass


def parse():
    """Copy each MD5-named file from config['input'] to config['output'],
    renamed to a space-joined list of its e621 tags (filtered, popularity-
    sorted, truncated to config['filenameLimit'] characters).

    Side effects: hits the e621 API for uncached MD5s (rate-limited),
    copies files, and rewrites the cache JSON on completion.
    """
    files = list(os.listdir(config['input']))

    for i, file in enumerate(files):
        match = re.match(r"^([a-f0-9]{32})", file)
        if not match:
            continue  # not an MD5-named file; leave it alone
        md5 = match.group(1)
        print(f"[{(100.0 * i / len(files)):3.0f}%]: {md5}")

        rateLimit = False
        if md5 not in cache:
            rateLimit = True  # only throttle when we actually hit the API
            request = urllib.request.Request(
                f"https://e621.net/posts.json?tags=md5:{md5}",
                headers={
                    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
                }
            )
            with urllib.request.urlopen(request) as r:
                j = json.loads(r.read())
                # empty dict marks "no such post" so we don't re-query it
                cache[md5] = j["posts"][0] if j["posts"] else {}

        json_meta = cache[md5]
        if not json_meta:
            continue  # known-missing post

        tags = []
        artist = ""
        content = {
            "s": "safe content",
            "q": "questionable content",
            "e": "explicit content",  # bugfix: was misspelled "explict content"
        }.get(json_meta["rating"], "")

        for cat in json_meta["tags"]:
            if cat == "artist":
                tag = "by " + " and ".join(json_meta["tags"]["artist"])
                if config['onlyIncludeModelArtists'] and tag not in config['tags']:
                    continue
                artist = tag
            else:
                for tag in json_meta["tags"][cat]:
                    tag = tag.replace("_", " ")
                    if tag not in config['tags']:
                        continue  # tag unknown to the model; useless in a prompt
                    if "/" in tag or ":" in tag:
                        continue  # illegal filename character
                    if config['filter']:
                        if tag in config['filters']:
                            continue
                        if any(re.search(pattern, tag) for pattern in config['filtersRegex']):
                            continue
                    tags.append(tag)

        # Most-represented model tags first (inverted when reverseTags is set);
        # rating text and artist credit are forced to the front.
        tags.sort(key=lambda x: -config['tags'][x], reverse=config['reverseTags'])
        if artist:
            tags.insert(0, artist)
        if content:
            tags.insert(0, content)

        # Greedily take tags in order until the filename limit would be exceeded.
        jointmp = ""
        filtered = []
        for tag in tags:
            if len(jointmp + " " + tag) > config['filenameLimit']:
                break
            jointmp += " " + tag
            filtered.append(tag)
        joined = " ".join(filtered)

        shutil.copy(
            os.path.join(config['input'], file),
            os.path.join(config['output'], file.replace(md5, joined))
        )

        if rateLimit and config['rateLimit']:
            time.sleep(config['rateLimit'] / 1000.0)

    # NOOOOOOOO YOU'RE WASTING SPACE BY PRETTIFYING
    with open(config['cache'], 'wb') as f:
        f.write(json.dumps(cache, indent='\t').encode('utf-8'))


if __name__ == "__main__":
    parse()