stable-diffusion-utils/utils/renamer/preprocess.py

# Credits to https://gist.github.com/nnuudev/56ed3242023c8582a32e3130ef59730b / https://boards.4chan.org/trash/thread/51463059#p51472156

import os
import re
import json
import time
import shutil
import urllib.request

# User-editable settings for the renamer.
config = {
	# directory containing the files to process
	"input": "./in/",
	# directory the renamed copies are written to
	"output": "./out/",
	# csv of tags associated with the yiffy model (swap in the taglist of whichever
	# booru flavor matches the model you're training against)
	"tags": "./tags.csv",
	# JSON file of cached post metadata; speeds up processing when re-running
	"cache": "./cache.json",

	# time to wait between requests, in milliseconds
	# (e621 imposes a rate limit of 2 requests per second)
	"rateLimit": 500,
	# maximum characters to put in the filename; needed to abide by filesystem
	# limitations, and to "limit" the token count for the prompt parser
	"filenameLimit": 192,

	# master switch for the two filter lists below
	"filter": True,
	# exact tags you don't want to make it into the filename;
	# very common tags such as "anthro", "male", "female" are good candidates
	"filters": [
		"anthro",
		"fur",
		"male",
		"female",
		"genitals",
		"video games",
		"animal genitalia",
	],
	# regular expressions; any tag matched by one of them is dropped as well
	"filtersRegex": [
		r"clothing$",
	],

	# if True, only include the artist's tag when it is in the model's taglist;
	# if False, add all artists
	# (some artists missing from the taglist are still known to LAION / vanilla SD)
	"onlyIncludeModelArtists": True,

	# inverts sorting, prioritizing tags with little representation in the model
	"reverseTags": False,
}

# Load the model's tag list. Each non-blank line is "<tag>,<count>"; the path stored in
# config['tags'] is then replaced by a dict mapping tag -> count.
with open(config['tags'], 'rb') as f:
	csv = f.read().decode('utf-8').split("\n")
config['tags'] = {}
for i in csv:
	if not i.strip():
		continue # blank line, e.g. the empty element left behind by a trailing newline
	k, v = i.rsplit(',', 1) # split on the last comma only, so a tag containing commas stays intact
	config['tags'][k] = int(v)

# Start from the metadata cache left by a previous run, if a usable one exists.
cache = {}
try:
	with open(config['cache'], 'rb') as f:
		cache = json.loads(f.read().decode('utf-8'))
except (OSError, ValueError):
	# Missing/unreadable file (OSError) or undecodable/malformed JSON (ValueError):
	# keep the empty cache. Anything else, including Ctrl-C, is allowed to propagate.
	pass

def _fetch_post(md5):
	"""Ask e621 for the post whose file hash is `md5`; return its JSON dict, or {} when no post matches."""
	request = urllib.request.Request(
		f"https://e621.net/posts.json?tags=md5:{md5}",
		headers = {
			'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
		},
	)
	with urllib.request.urlopen(request) as r:
		j = json.loads(r.read())
	return j["posts"][0] if j["posts"] else {}


def _collect_tags(json_meta):
	"""Return `(artist, tags)` for a post: the "by ..." artist label ("" if none is kept) and the filtered, sorted tags."""
	artist = ""
	tags = []
	for cat, names in json_meta["tags"].items():
		if cat == "artist":
			if not names:
				continue # no artist listed; avoids emitting a dangling "by "
			tag = "by " + " and ".join(names)
			if config['onlyIncludeModelArtists'] and tag not in config['tags']:
				continue
			artist = tag
			continue

		for tag in names:
			tag = tag.replace("_", " ")
			if tag not in config['tags']:
				continue # unknown to the model's taglist
			if "/" in tag or ":" in tag:
				continue # illegal filename character
			if config['filter']:
				if tag in config['filters']:
					continue
				if any(re.search(pattern, tag) for pattern in config['filtersRegex']):
					continue
			tags.append(tag)

	# Negated count => highest count first; 'reverseTags' flips that order.
	tags.sort(key=lambda x: -config['tags'][x], reverse=config['reverseTags'])
	return artist, tags


def _fit_to_limit(tags, limit):
	"""Return the longest leading run of `tags` that fits the `limit` character budget."""
	kept = []
	used = 0
	for tag in tags:
		# Every tag, including the first, is charged one separator character.
		used += 1 + len(tag)
		if used > limit:
			break
		kept.append(tag)
	return kept


def parse():
	"""Copy every recognised file from config['input'] to config['output'], renamed after its tags.

	A file is recognised when its name starts with 32 lowercase hex characters (an md5).
	The post metadata for that md5 is taken from `cache`, or fetched from e621 and added
	to `cache` when missing. The md5 in the filename is replaced by the rating label, the
	artist label and the tags (space separated), truncated to config['filenameLimit'].
	Files without a matching post are skipped.

	The cache is written to config['cache'] when the function ends, including when it
	ends with an exception, so completed requests are not repeated on the next run.

	NOTE: files that resolve to the same name overwrite each other in the output directory.

	Raises:
		OSError: the input directory cannot be listed, or a copy/cache write fails.
		urllib.error.URLError: a metadata request fails.
	"""
	files = os.listdir(config['input'])
	os.makedirs(config['output'], exist_ok=True) # shutil.copy does not create the directory

	ratings = {
		"s": "safe content",
		"q": "questionable content",
		"e": "explicit content",
	}

	try:
		for index, file in enumerate(files):
			md5 = re.match(r"^([a-f0-9]{32})", file)
			if not md5:
				continue
			md5 = md5.group(1)

			print(f"[{(100.0 * index / len(files)):3.0f}%]: {md5}")

			if md5 not in cache:
				cache[md5] = _fetch_post(md5)
				# Wait right after the request so that every request is throttled,
				# including those for files that turn out to have no matching post.
				if config['rateLimit']:
					time.sleep(config['rateLimit'] / 1000.0)
			json_meta = cache[md5]
			if not json_meta:
				continue

			content = ratings.get(json_meta["rating"], "")
			artist, tags = _collect_tags(json_meta)
			# Final order: rating, artist, then the sorted tags.
			if artist:
				tags.insert(0, artist)
			if content:
				tags.insert(0, content)

			joined = " ".join(_fit_to_limit(tags, config['filenameLimit']))

			shutil.copy(os.path.join(config['input'], file), os.path.join(config['output'], file.replace(md5, joined)))
	finally:
		# Tab-indented JSON: larger on disk, but readable when inspecting the cache by hand.
		with open(config['cache'], 'wb') as f:
			f.write(json.dumps(cache, indent='\t').encode('utf-8'))

# Run the renamer only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
	parse()