stable-diffusion-utils/utils/renamer/preprocess.py

# Credits to https://gist.github.com/nnuudev/56ed3242023c8582a32e3130ef59730b / https://boards.4chan.org/trash/thread/51463059#p51472156

import os
import re
import json
import time
import shutil
import math
import urllib.request

config = {
	'input': './in/', # files to process
	'output': './out/', # files to copy files to
	'tags': './tags.csv', # csv of tags associated with the yiffy model (replace for other flavor of booru's taglist associated with the model you're training against)
	'cache': './cache.json', # JSON file of cached tags, will speed up processing if re-running

	'rateLimit': 500, # time to wait between requests, in milliseconds, e621 imposes a rate limit of 2 requests per second
	'filenameLimit': 245, # maximum characters to put in the filename, necessary to abide by filesystem limitations
	# you can set this to 245, as the web UI has uncapped the prompt limit, but I have yet to test this if this limit was also lifted for textual inversion

	'filter': True,
	# fill it with tags of whatever you don't want to make it into the filename
	# for starters, you can also add "anthro", "male", "female", as they're very common tags
	'filters': [
	#	"anthro",
	#	"fur",
	#	"male",
	#	"female",

		"animal genitalia",

		"genitals",
		"video games",
	],
	'filtersRegex': [
		r"clothing$",
		r"fluids$",
		r" (fe)?male$",
	],

	# treat these tags as already being included in the 
	# if you're cautious (paranoid), include species you want, but I found I don't really even need to include specis
	# you can also include character names / series names if you're using this for hypernetworks
	'tagsOverride': ["character", "copyright"], # useful for textual inversion training
#	'tagsOverride': ["character", "species", "copyright"], # useful for hypernetwork training
	'tagsOverrideStart': 1000000, # starting score that your overriden tags will start from, for sorting purposes

	# tags to always include in the list
	# I HIGHLY suggest including these tags in your training template instead
	'tagsAutoInclude': [],

	'removeParentheses': True, # removes shit like `blaidd_(elden_ring)` or `curt_(animal_crossing)` without needing to specify it all in the above
	# good because it messes with a lot of shit

	'onlyIncludeModelArtists': True, # if True, only include the artist's tag if in the model's taglist, if false, add all artists
	# i've noticed some artists that weren't included in the taglist, but is available in LAION's (vanilla SD)

	'reverseTags': False, # inverts sorting, prioritizing tags with little representation in the model

	'tagDelimiter': ",", # what separates each tag in the filename, web UI will accept comma separated filenames, but will insert it without commas
}

with open(config['tags'], 'rb') as f:
	csv = f.read().decode('utf-8').split("\n")
config['tags'] = {}
for i in csv:
	k, v = i.split(',')
	config['tags'][k] = int(v)

for i in range(len(config['tagsOverride'])):
	override = config['tagsOverride'][i].replace("_", " ")
	config['tags'][override] = config['tagsOverrideStart']
	config['tagsOverrideStart'] = config['tagsOverrideStart'] - 1

cache = {}
try:
	with open(config['cache'], 'rb') as f:
		cache = json.loads(f.read().decode('utf-8'))
except:
	pass

def parse():
	global config, cache
	files = []
	for file in os.listdir(config['input']):
		files.append(file)
	for i in range(len(files)):
		file = files[i]
		# try filenames like "83737b5e961b594c26e8feaed301e7a5 (1).jpg" (duplicated copies from a file manager)
		md5 = re.match(r"^([a-f0-9]{32})", file)
		if not md5:
			# try filenames like "00001-83737b5e961b594c26e8feaed301e7a5.jpg" (output from voldy's web UI preprocessing)
			md5 = re.match(r"([a-f0-9]{32})\.(jpe?g|png)$", file)
			if not md5:
				continue
		md5 = md5.group(1)

		print(f"[{(100.0 * i / len(files)):3.0f}%]: {md5}")

		rateLimit = False
		if not md5 in cache:
			rateLimit = True
			with urllib.request.urlopen(urllib.request.Request(f"https://e621.net/posts.json?tags=md5:{md5}",
				headers = {
					'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
				}
			)) as r:
				j = json.loads(r.read())
			cache[md5] = j["posts"][0] if j["posts"] else {}
		json_meta = cache[md5]
		if not json_meta:
			continue
		tags = config['tagsAutoInclude'].copy()

		artist = ""
		content = {
			"s": "safe content",
			"q": "questionable content",
			"e": "explict content",
		}.get(json_meta["rating"], "")

		for cat in json_meta["tags"]:
			override = cat in config['tagsOverride']
			if cat == "artist":
				tag = "by " + " and ".join(json_meta["tags"]["artist"])
				if config['onlyIncludeModelArtists'] and not tag in config['tags']:
					continue
				artist = tag
			else:
				for tag in json_meta["tags"][cat]:
					tag = tag.replace("_", " ")
					if not override:
						override = tag in config['tagsOverride']

					if override:
						if tag not in config['tags']:
							idx = config['tagsOverride'].index( cat )
							if idx >= 0:
								scale = math.pow(10, len(config['tagsOverride']) - idx + 1)
							else:
								scale = 1
							config['tags'][tag] = config['tagsOverrideStart'] * scale
							config['tagsOverrideStart'] = config['tagsOverrideStart'] - 1
					elif tag not in config['tags']:
						continue
					if "/" in tag or ":" in tag:
						continue # illegal filename character

					if config['filter']:
						should = False
						if tag in config['filters']:
							continue # was break in the original script, fixed ;)
						for filter in config['filtersRegex']:
							if re.search(filter, tag):
								should = True
								break
						if should:
							continue

					tags.append(tag)
		tags.sort(key=lambda x: -config['tags'][x], reverse=config['reverseTags'])
		if artist:
			tags.insert(0, artist)
		if content:
			tags.insert(0, content)

		jointmp = ""
		filtered = []
		for i in tags:
			if len(jointmp + config['tagDelimiter'] + i) > config['filenameLimit']:
				break
			jointmp += config['tagDelimiter'] + i
			if config['removeParentheses']:
				i = re.sub(r"\(.+?\)$", "", i).strip()
			filtered.append(i)
		joined = config['tagDelimiter'].join(filtered)

		shutil.copy(os.path.join(config['input'], file), os.path.join(config['output'], file.replace(md5, " "+joined).strip()))

		if rateLimit and config['rateLimit']:
			time.sleep(config['rateLimit'] / 1000.0)

	# NOOOOOOOO YOU'RE WASTING SPACE BY PRETTIFYING
	with open(config['cache'], 'wb') as f:
		f.write(json.dumps(cache, indent='\t').encode('utf-8'))

if __name__ == "__main__":
	parse()
thanks anon for the python port, owe you a blowjob 2022-10-08 00:16:29 +00:00			`# Credits to https://gist.github.com/nnuudev/56ed3242023c8582a32e3130ef59730b / https://boards.4chan.org/trash/thread/51463059#p51472156`

			`import os`
			`import re`
			`import json`
			`import time`
			`import shutil`
bug fixes, more features, some starting notes on hypernetworks 2022-10-12 03:45:16 +00:00			`import math`
thanks anon for the python port, owe you a blowjob 2022-10-08 00:16:29 +00:00			`import urllib.request`

			`config = {`
			`'input': './in/', # files to process`
			`'output': './out/', # files to copy files to`
			`'tags': './tags.csv', # csv of tags associated with the yiffy model (replace for other flavor of booru's taglist associated with the model you're training against)`
			`'cache': './cache.json', # JSON file of cached tags, will speed up processing if re-running`

			`'rateLimit': 500, # time to wait between requests, in milliseconds, e621 imposes a rate limit of 2 requests per second`
bug fixes, more features, some starting notes on hypernetworks 2022-10-12 03:45:16 +00:00			`'filenameLimit': 245, # maximum characters to put in the filename, necessary to abide by filesystem limitations`
			`# you can set this to 245, as the web UI has uncapped the prompt limit, but I have yet to test this if this limit was also lifted for textual inversion`
thanks anon for the python port, owe you a blowjob 2022-10-08 00:16:29 +00:00
			`'filter': True,`
			`# fill it with tags of whatever you don't want to make it into the filename`
			`# for starters, you can also add "anthro", "male", "female", as they're very common tags`
			`'filters': [`
bug fixes, more features, some starting notes on hypernetworks 2022-10-12 03:45:16 +00:00			`# "anthro",`
			`# "fur",`
			`# "male",`
			`# "female",`

			`"animal genitalia",`

thanks anon for the python port, owe you a blowjob 2022-10-08 00:16:29 +00:00			`"genitals",`
			`"video games",`
			`],`
			`'filtersRegex': [`
			`r"clothing$",`
bug fixes, more features, some starting notes on hypernetworks 2022-10-12 03:45:16 +00:00			`r"fluids$",`
			`r" (fe)?male$",`
thanks anon for the python port, owe you a blowjob 2022-10-08 00:16:29 +00:00			`],`

bug fixes, more features, some starting notes on hypernetworks 2022-10-12 03:45:16 +00:00			`# treat these tags as already being included in the`
			`# if you're cautious (paranoid), include species you want, but I found I don't really even need to include specis`
			`# you can also include character names / series names if you're using this for hypernetworks`
slight fix, added more notes on hypernetworks 2022-10-12 15:53:12 +00:00			`'tagsOverride': ["character", "copyright"], # useful for textual inversion training`
			`# 'tagsOverride': ["character", "species", "copyright"], # useful for hypernetwork training`
bug fixes, more features, some starting notes on hypernetworks 2022-10-12 03:45:16 +00:00			`'tagsOverrideStart': 1000000, # starting score that your overriden tags will start from, for sorting purposes`

			`# tags to always include in the list`
			`# I HIGHLY suggest including these tags in your training template instead`
			`'tagsAutoInclude': [],`

			'removeParentheses': True, # removes shit like `blaidd_(elden_ring)` or `curt_(animal_crossing)` without needing to specify it all in the above
			`# good because it messes with a lot of shit`

thanks anon for the python port, owe you a blowjob 2022-10-08 00:16:29 +00:00			`'onlyIncludeModelArtists': True, # if True, only include the artist's tag if in the model's taglist, if false, add all artists`
			`# i've noticed some artists that weren't included in the taglist, but is available in LAION's (vanilla SD)`

			`'reverseTags': False, # inverts sorting, prioritizing tags with little representation in the model`
bug fixes, more features, some starting notes on hypernetworks 2022-10-12 03:45:16 +00:00
			`'tagDelimiter': ",", # what separates each tag in the filename, web UI will accept comma separated filenames, but will insert it without commas`
thanks anon for the python port, owe you a blowjob 2022-10-08 00:16:29 +00:00			`}`

			`with open(config['tags'], 'rb') as f:`
			`csv = f.read().decode('utf-8').split("\n")`
			`config['tags'] = {}`
			`for i in csv:`
			`k, v = i.split(',')`
			`config['tags'][k] = int(v)`

bug fixes, more features, some starting notes on hypernetworks 2022-10-12 03:45:16 +00:00			`for i in range(len(config['tagsOverride'])):`
			`override = config['tagsOverride'][i].replace("_", " ")`
			`config['tags'][override] = config['tagsOverrideStart']`
			`config['tagsOverrideStart'] = config['tagsOverrideStart'] - 1`

thanks anon for the python port, owe you a blowjob 2022-10-08 00:16:29 +00:00			`cache = {}`
			`try:`
			`with open(config['cache'], 'rb') as f:`
			`cache = json.loads(f.read().decode('utf-8'))`
			`except:`
			`pass`

			`def parse():`
			`global config, cache`
			`files = []`
			`for file in os.listdir(config['input']):`
			`files.append(file)`
			`for i in range(len(files)):`
			`file = files[i]`
Added some notes regarding style training, refined some ambiguous nerd-language 2022-10-10 02:34:48 +00:00			`# try filenames like "83737b5e961b594c26e8feaed301e7a5 (1).jpg" (duplicated copies from a file manager)`
thanks anon for the python port, owe you a blowjob 2022-10-08 00:16:29 +00:00			`md5 = re.match(r"^([a-f0-9]{32})", file)`
			`if not md5:`
Added some notes regarding style training, refined some ambiguous nerd-language 2022-10-10 02:34:48 +00:00			`# try filenames like "00001-83737b5e961b594c26e8feaed301e7a5.jpg" (output from voldy's web UI preprocessing)`
			`md5 = re.match(r"([a-f0-9]{32})\.(jpe?g\|png)$", file)`
			`if not md5:`
			`continue`
thanks anon for the python port, owe you a blowjob 2022-10-08 00:16:29 +00:00			`md5 = md5.group(1)`

			`print(f"[{(100.0 * i / len(files)):3.0f}%]: {md5}")`

			`rateLimit = False`
			`if not md5 in cache:`
			`rateLimit = True`
			`with urllib.request.urlopen(urllib.request.Request(f"https://e621.net/posts.json?tags=md5:{md5}",`
			`headers = {`
			`'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'`
			`}`
			`)) as r:`
			`j = json.loads(r.read())`
			`cache[md5] = j["posts"][0] if j["posts"] else {}`
			`json_meta = cache[md5]`
			`if not json_meta:`
			`continue`
bug fixes, more features, some starting notes on hypernetworks 2022-10-12 03:45:16 +00:00			`tags = config['tagsAutoInclude'].copy()`
thanks anon for the python port, owe you a blowjob 2022-10-08 00:16:29 +00:00
			`artist = ""`
			`content = {`
			`"s": "safe content",`
			`"q": "questionable content",`
			`"e": "explict content",`
			`}.get(json_meta["rating"], "")`

			`for cat in json_meta["tags"]:`
bug fixes, more features, some starting notes on hypernetworks 2022-10-12 03:45:16 +00:00			`override = cat in config['tagsOverride']`
thanks anon for the python port, owe you a blowjob 2022-10-08 00:16:29 +00:00			`if cat == "artist":`
			`tag = "by " + " and ".join(json_meta["tags"]["artist"])`
			`if config['onlyIncludeModelArtists'] and not tag in config['tags']:`
			`continue`
			`artist = tag`
			`else:`
			`for tag in json_meta["tags"][cat]:`
			`tag = tag.replace("_", " ")`
bug fixes, more features, some starting notes on hypernetworks 2022-10-12 03:45:16 +00:00			`if not override:`
			`override = tag in config['tagsOverride']`

			`if override:`
			`if tag not in config['tags']:`
			`idx = config['tagsOverride'].index( cat )`
			`if idx >= 0:`
			`scale = math.pow(10, len(config['tagsOverride']) - idx + 1)`
			`else:`
			`scale = 1`
			`config['tags'][tag] = config['tagsOverrideStart'] * scale`
			`config['tagsOverrideStart'] = config['tagsOverrideStart'] - 1`
			`elif tag not in config['tags']:`
thanks anon for the python port, owe you a blowjob 2022-10-08 00:16:29 +00:00			`continue`
			`if "/" in tag or ":" in tag:`
			`continue # illegal filename character`

			`if config['filter']:`
			`should = False`
			`if tag in config['filters']:`
			`continue # was break in the original script, fixed ;)`
			`for filter in config['filtersRegex']:`
			`if re.search(filter, tag):`
			`should = True`
python script for the e621 downloader 2022-10-10 20:51:34 +00:00			`break`
thanks anon for the python port, owe you a blowjob 2022-10-08 00:16:29 +00:00			`if should:`
			`continue`

			`tags.append(tag)`
			`tags.sort(key=lambda x: -config['tags'][x], reverse=config['reverseTags'])`
			`if artist:`
			`tags.insert(0, artist)`
			`if content:`
			`tags.insert(0, content)`

			`jointmp = ""`
			`filtered = []`
			`for i in tags:`
bug fixes, more features, some starting notes on hypernetworks 2022-10-12 03:45:16 +00:00			`if len(jointmp + config['tagDelimiter'] + i) > config['filenameLimit']:`
thanks anon for the python port, owe you a blowjob 2022-10-08 00:16:29 +00:00			`break`
bug fixes, more features, some starting notes on hypernetworks 2022-10-12 03:45:16 +00:00			`jointmp += config['tagDelimiter'] + i`
			`if config['removeParentheses']:`
			`i = re.sub(r"\(.+?\)$", "", i).strip()`
thanks anon for the python port, owe you a blowjob 2022-10-08 00:16:29 +00:00			`filtered.append(i)`
bug fixes, more features, some starting notes on hypernetworks 2022-10-12 03:45:16 +00:00			`joined = config['tagDelimiter'].join(filtered)`
thanks anon for the python port, owe you a blowjob 2022-10-08 00:16:29 +00:00
slight fix, added more notes on hypernetworks 2022-10-12 15:53:12 +00:00			`shutil.copy(os.path.join(config['input'], file), os.path.join(config['output'], file.replace(md5, " "+joined).strip()))`
thanks anon for the python port, owe you a blowjob 2022-10-08 00:16:29 +00:00
			`if rateLimit and config['rateLimit']:`
			`time.sleep(config['rateLimit'] / 1000.0)`

			`# NOOOOOOOO YOU'RE WASTING SPACE BY PRETTIFYING`
			`with open(config['cache'], 'wb') as f:`
			`f.write(json.dumps(cache, indent='\t').encode('utf-8'))`

			`if __name__ == "__main__":`
			`parse()`