diff --git a/README.md b/README.md index f6ec135..89dcfcf 100755 --- a/README.md +++ b/README.md @@ -11,7 +11,6 @@ An up-to-date repo with all the necessary files can be found here: https://git.c This guide assumes the following basics: * you have a working installation of Voldy's Web UI set ui * you already have the furry/yiffy models downloaded -* you have node.js installed (**!**TODO**!** Port script to python, as everyone should have python already) You can also extend this into any other booru-oriented model, but you'll have to modify the pre-processing script according to the site images were pulled from. The general concepts still apply. @@ -35,12 +34,22 @@ These tips also can apply to training an artist's art style instead, but I've ye ## Pre-Processing Script -**!**TODO**!**: actually test the script, and port it to Python - -Below is a quick hack job from my server of the script. You're required to already have node.js and `node-fetch` version 2.x (`npm install node-fetch@2` in the folder with the script). It is ***imperative*** you install version 2, as later versions moved to needing `import` (YUCK) over `require`. - You are not required to actually run this, as this script is just a shortcut to manually renaming files and curating the tags, but it cuts the bulk work of it. +Included in the repo under [`./utils/renamer/`](https://git.coom.tech/mrq/stable-diffusion-utils/src/branch/master/utils/renamer) is a script for tagging images from e621 in the filename for later use in the web UI. + +With little additional configuration, use the python variant: `preprocess.py` (credits to [anon](https://boards.4chan.org/trash/thread/51463059#p51472156)). Just put your images in the `./utils/renamer/in/` folder, then run the script. + +You can also have multiple variations of the same images, as it's useful if you're splitting an image into multiple parts. 
For example, the following is valid: +``` +ef331a09e313914aa0bcb2c5310660ec.jpg +aacb4870a669b0fc7e1ede0c1652fa8c (1).jpg +aacb4870a669b0fc7e1ede0c1652fa8c (2).jpg +554982d3498e67a50f768e6e18088072.jpg +554982d3498e67a50f768e6e18088072 (1).jpg +554982d3498e67a50f768e6e18088072 (2).jpg +``` + The generalized procedure is as followed: * load a list of tags associated with the SD model * grab a list of filenames diff --git a/utils/renamer/preprocess.py b/utils/renamer/preprocess.py new file mode 100755 index 0000000..0780b3d --- /dev/null +++ b/utils/renamer/preprocess.py @@ -0,0 +1,143 @@ +# Credits to https://gist.github.com/nnuudev/56ed3242023c8582a32e3130ef59730b / https://boards.4chan.org/trash/thread/51463059#p51472156 + +import os +import re +import json +import time +import shutil +import urllib.request + +config = { + 'input': './in/', # files to process + 'output': './out/', # files to copy files to + 'tags': './tags.csv', # csv of tags associated with the yiffy model (replace for other flavor of booru's taglist associated with the model you're training against) + 'cache': './cache.json', # JSON file of cached tags, will speed up processing if re-running + + 'rateLimit': 500, # time to wait between requests, in milliseconds, e621 imposes a rate limit of 2 requests per second + 'filenameLimit': 192, # maximum characters to put in the filename, necessary to abide by filesystem limitations, and to "limit" token count for the prompt parser + + 'filter': True, + # fill it with tags of whatever you don't want to make it into the filename + # for starters, you can also add "anthro", "male", "female", as they're very common tags + 'filters': [ + "anthro", + "fur", + "male", + "female", + "genitals", + "video games", + "animal genitalia", + ], + 'filtersRegex': [ + r"clothing$", + ], + + 'onlyIncludeModelArtists': True, # if True, only include the artist's tag if in the model's taglist, if false, add all artists + # i've noticed some artists that weren't included in 
the taglist, but is available in LAION's (vanilla SD) + + 'reverseTags': False, # inverts sorting, prioritizing tags with little representation in the model +} + +with open(config['tags'], 'rb') as f: + csv = f.read().decode('utf-8').split("\n") +config['tags'] = {} +for i in csv: + k, v = i.split(',') + config['tags'][k] = int(v) + +cache = {} +try: + with open(config['cache'], 'rb') as f: + cache = json.loads(f.read().decode('utf-8')) +except: + pass + +def parse(): + global config, cache + files = [] + for file in os.listdir(config['input']): + files.append(file) + for i in range(len(files)): + file = files[i] + md5 = re.match(r"^([a-f0-9]{32})", file) + if not md5: + continue + md5 = md5.group(1) + + print(f"[{(100.0 * i / len(files)):3.0f}%]: {md5}") + + rateLimit = False + if not md5 in cache: + rateLimit = True + with urllib.request.urlopen(urllib.request.Request(f"https://e621.net/posts.json?tags=md5:{md5}", + headers = { + 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36' + } + )) as r: + j = json.loads(r.read()) + cache[md5] = j["posts"][0] if j["posts"] else {} + json_meta = cache[md5] + if not json_meta: + continue + tags = [] + + artist = "" + content = { + "s": "safe content", + "q": "questionable content", + "e": "explict content", + }.get(json_meta["rating"], "") + + for cat in json_meta["tags"]: + if cat == "artist": + tag = "by " + " and ".join(json_meta["tags"]["artist"]) + if config['onlyIncludeModelArtists'] and not tag in config['tags']: + continue + artist = tag + else: + for tag in json_meta["tags"][cat]: + tag = tag.replace("_", " ") + if tag not in config['tags']: + continue + if "/" in tag or ":" in tag: + continue # illegal filename character + + + if config['filter']: + should = False + if tag in config['filters']: + continue # was break in the original script, fixed ;) + for filter in config['filtersRegex']: + if re.search(filter, tag): + should = True + 
continue # was break in the original script, fixed ;) + if should: + continue + + tags.append(tag) + tags.sort(key=lambda x: -config['tags'][x], reverse=config['reverseTags']) + if artist: + tags.insert(0, artist) + if content: + tags.insert(0, content) + + jointmp = "" + filtered = [] + for i in tags: + if len(jointmp + " " + i) > config['filenameLimit']: + break + jointmp += " " + i + filtered.append(i) + joined = " ".join(filtered) + + shutil.copy(os.path.join(config['input'], file), os.path.join(config['output'], file.replace(md5, joined))) + + if rateLimit and config['rateLimit']: + time.sleep(config['rateLimit'] / 1000.0) + + # NOOOOOOOO YOU'RE WASTING SPACE BY PRETTIFYING + with open(config['cache'], 'wb') as f: + f.write(json.dumps(cache, indent='\t').encode('utf-8')) + +if __name__ == "__main__": + parse() \ No newline at end of file