thanks anon for the python port, owe you a blowjob

2022-10-08 00:16:29 +00:00 · 2022-10-08 00:16:29 +00:00 · 6d78379883
commit 6d78379883
parent 7856f7fae7
2 changed files with 157 additions and 5 deletions
--- a/README.md
+++ b/README.md
@ -11,7 +11,6 @@ An up-to-date repo with all the necessary files can be found here: https://git.c
 This guide assumes the following basics:
 * you have a working installation of Voldy's Web UI set up
 * you already have the furry/yiffy models downloaded
-* you have node.js installed (**!**TODO**!** Port script to python, as everyone should have python already)

 You can also extend this into any other booru-oriented model, but you'll have to modify the pre-processing script according to the site images were pulled from. The general concepts still apply.

@ -35,12 +34,22 @@ These tips also can apply to training an artist's art style instead, but I've ye

 ## Pre-Processing Script

-**!**TODO**!**: actually test the script, and port it to Python
-
-Below is a quick hack job from my server of the script. You're required to already have node.js and `node-fetch` version 2.x (`npm install node-fetch@2` in the folder with the script). It is ***imperative*** you install version 2, as later versions moved to needing `import` (YUCK) over `require`.
-
 You are not required to actually run this, as this script is just a shortcut to manually renaming files and curating the tags, but it cuts the bulk work of it.

+Included in the repo under [`./utils/renamer/`](https://git.coom.tech/mrq/stable-diffusion-utils/src/branch/master/utils/renamer) is a script for tagging images from e621 in the filename for later use in the web UI.
+
+If you want as little additional configuration as possible, use the Python variant: `preprocess.py` (credits to [anon](https://boards.4chan.org/trash/thread/51463059#p51472156)). Just put your images in the `./utils/renamer/in/` folder, then run the script.
+
+You can also have multiple variations of the same images, as it's useful if you're splitting an image into multiple parts. For example, the following is valid:
+```
+ef331a09e313914aa0bcb2c5310660ec.jpg
+aacb4870a669b0fc7e1ede0c1652fa8c (1).jpg
+aacb4870a669b0fc7e1ede0c1652fa8c (2).jpg
+554982d3498e67a50f768e6e18088072.jpg
+554982d3498e67a50f768e6e18088072 (1).jpg
+554982d3498e67a50f768e6e18088072 (2).jpg
+```
+
 The generalized procedure is as follows:
 * load a list of tags associated with the SD model
 * grab a list of filenames
--- a/utils/renamer/preprocess.py
+++ b/utils/renamer/preprocess.py
@ -0,0 +1,143 @@
+# Credits to https://gist.github.com/nnuudev/56ed3242023c8582a32e3130ef59730b / https://boards.4chan.org/trash/thread/51463059#p51472156
+
+import os
+import re
+import json
+import time
+import shutil
+import urllib.request
+
+config = {
+	'input': './in/', # directory containing the images to process
+	'output': './out/', # directory the renamed copies are written to
+	'tags': './tags.csv', # csv ("tag,count" per line) of tags associated with the yiffy model (replace with another booru's taglist if you're training against a different model); replaced by a {tag: count} dict once loaded
+	'cache': './cache.json', # JSON file of cached e621 post metadata keyed by md5, speeds up processing when re-running
+
+	'rateLimit': 500, # time to wait between requests, in milliseconds; e621 imposes a rate limit of 2 requests per second
+	'filenameLimit': 192, # maximum characters to put in the filename, necessary to abide by filesystem limitations, and to "limit" token count for the prompt parser
+
+	'filter': True, # set to False to ignore both 'filters' and 'filtersRegex'
+	# fill it with tags of whatever you don't want to make it into the filename
+	# very common tags such as "anthro", "male", "female" are good candidates
+	'filters': [
+		"anthro",
+		"fur",
+		"male",
+		"female",
+		"genitals",
+		"video games",
+		"animal genitalia",
+	],
+	'filtersRegex': [ # regular expressions tested with re.search; a tag matching any of them is dropped
+		r"clothing$",
+	],
+
+	'onlyIncludeModelArtists': True, # if True, only include the artist's tag if it is in the model's taglist; if False, add all artists
+	# some artists are not included in the taglist, but are available in LAION's (vanilla SD)
+
+	'reverseTags': False, # inverts sorting, prioritizing tags with little representation in the model
+}
+
+with open(config['tags'], 'rb') as f:
+	csv = f.read().decode('utf-8').split("\n")
+config['tags'] = {}
+for i in csv:
+	k, v = i.split(',')
+	config['tags'][k] = int(v)
+
+cache = {}
+try:
+	with open(config['cache'], 'rb') as f:
+		cache = json.loads(f.read().decode('utf-8'))
+except:
+	pass
+
+def parse():
+	global config, cache
+	files = []
+	for file in os.listdir(config['input']):
+		files.append(file)
+	for i in range(len(files)):
+		file = files[i]
+		md5 = re.match(r"^([a-f0-9]{32})", file)
+		if not md5:
+			continue
+		md5 = md5.group(1)
+
+		print(f"[{(100.0 * i / len(files)):3.0f}%]: {md5}")
+
+		rateLimit = False
+		if not md5 in cache:
+			rateLimit = True
+			with urllib.request.urlopen(urllib.request.Request(f"https://e621.net/posts.json?tags=md5:{md5}",
+				headers = {
+					'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
+				}
+			)) as r:
+				j = json.loads(r.read())
+			cache[md5] = j["posts"][0] if j["posts"] else {}
+		json_meta = cache[md5]
+		if not json_meta:
+			continue
+		tags = []
+
+		artist = ""
+		content = {
+			"s": "safe content",
+			"q": "questionable content",
+			"e": "explict content",
+		}.get(json_meta["rating"], "")
+
+		for cat in json_meta["tags"]:
+			if cat == "artist":
+				tag = "by " + " and ".join(json_meta["tags"]["artist"])
+				if config['onlyIncludeModelArtists'] and not tag in config['tags']:
+					continue
+				artist = tag
+			else:
+				for tag in json_meta["tags"][cat]:
+					tag = tag.replace("_", " ")
+					if tag not in config['tags']:
+						continue
+					if "/" in tag or ":" in tag:
+						continue # illegal filename character
+
+
+					if config['filter']:
+						should = False
+						if tag in config['filters']:
+							continue # was break in the original script, fixed ;)
+						for filter in config['filtersRegex']:
+							if re.search(filter, tag):
+								should = True
+								continue # was break in the original script, fixed ;)
+						if should:
+							continue
+
+					tags.append(tag)
+		tags.sort(key=lambda x: -config['tags'][x], reverse=config['reverseTags'])
+		if artist:
+			tags.insert(0, artist)
+		if content:
+			tags.insert(0, content)
+
+		jointmp = ""
+		filtered = []
+		for i in tags:
+			if len(jointmp + " " + i) > config['filenameLimit']:
+				break
+			jointmp += " " + i
+			filtered.append(i)
+		joined = " ".join(filtered)
+
+		shutil.copy(os.path.join(config['input'], file), os.path.join(config['output'], file.replace(md5, joined)))
+
+		if rateLimit and config['rateLimit']:
+			time.sleep(config['rateLimit'] / 1000.0)
+
+	# NOOOOOOOO YOU'RE WASTING SPACE BY PRETTIFYING
+	with open(config['cache'], 'wb') as f:
+		f.write(json.dumps(cache, indent='\t').encode('utf-8'))
+
+if __name__ == "__main__": # only process files when run as a script, not when imported
+	parse()