thanks anon for the python port, owe you a blowjob

parent 7856f7fae7
commit 6d78379883

README.md (19 lines changed):

@@ -11,7 +11,6 @@ An up-to-date repo with all the necessary files can be found here: https://git.c
 This guide assumes the following basics:
 * you have a working installation of Voldy's Web UI set up
 * you already have the furry/yiffy models downloaded
-* you have node.js installed (**!**TODO**!** Port script to python, as everyone should have python already)
 
 You can also extend this into any other booru-oriented model, but you'll have to modify the pre-processing script according to the site the images were pulled from. The general concepts still apply.
 
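
If you do adapt this to another booru, the e621-specific parts are just the `posts.json` fetch and the nested tag-category layout in the response. As a rough, untested sketch (Danbooru-flavored; verify against their API docs before trusting it):
```
# hypothetical Danbooru swap for the fetch in preprocess.py:
url = f"https://danbooru.donmai.us/posts.json?tags=md5:{md5}"
# Danbooru returns tags as flat space-separated strings (e.g. post["tag_string_artist"],
# post["tag_string_general"]) instead of e621's nested j["posts"][0]["tags"][category] lists
```
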
@@ -35,12 +34,22 @@ These tips also can apply to training an artist's art style instead, but I've ye
 
 ## Pre-Processing Script
 
-**!**TODO**!**: actually test the script, and port it to Python
-
-Below is a quick hack job from my server of the script. You're required to already have node.js and `node-fetch` version 2.x (`npm install node-fetch@2` in the folder with the script). It is ***imperative*** you install version 2, as later versions moved to needing `import` (YUCK) over `require`.
-
 You are not required to actually run this, as this script is just a shortcut to manually renaming files and curating the tags, but it cuts out the bulk of the work.
 
+Included in the repo under [`./utils/renamer/`](https://git.coom.tech/mrq/stable-diffusion-utils/src/branch/master/utils/renamer) is a script for tagging images from e621 in the filename for later use in the web UI.
+
+With little additional configuration, you can use the python variant: `preprocess.py` (credits to [anon](https://boards.4chan.org/trash/thread/51463059#p51472156)). Just put your images in the `./utils/renamer/in/` folder, then run the script.
+
+You can also have multiple variations of the same image, which is useful if you're splitting an image into multiple parts. For example, the following is valid:
+```
+ef331a09e313914aa0bcb2c5310660ec.jpg
+aacb4870a669b0fc7e1ede0c1652fa8c (1).jpg
+aacb4870a669b0fc7e1ede0c1652fa8c (2).jpg
+554982d3498e67a50f768e6e18088072.jpg
+554982d3498e67a50f768e6e18088072 (1).jpg
+554982d3498e67a50f768e6e18088072 (2).jpg
+```
 
 The generalized procedure is as follows:
 * load a list of tags associated with the SD model
 * grab a list of filenames
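
Concretely, the script renames a file to its rating string, then the artist, then the tags ordered by how well-represented they are in the model, all space-separated. A made-up example of the result (artist and tags invented for illustration):
```
554982d3498e67a50f768e6e18088072.jpg -> explict content by someartist solo smiling canine.jpg
```
("explict" is not an editing mistake here; it's the spelling the script itself uses for the rating string.)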

utils/renamer/preprocess.py (new executable file):

@@ -0,0 +1,143 @@
# Credits to https://gist.github.com/nnuudev/56ed3242023c8582a32e3130ef59730b / https://boards.4chan.org/trash/thread/51463059#p51472156

import os
import re
import json
import time
import shutil
import urllib.request

config = {
    'input': './in/', # files to process
    'output': './out/', # folder to copy renamed files to
    'tags': './tags.csv', # CSV of tags associated with the yiffy model (replace with the taglist of whatever other booru-flavored model you're training against)
    'cache': './cache.json', # JSON file of cached tags, will speed up processing if re-running

    'rateLimit': 500, # time to wait between requests, in milliseconds; e621 imposes a rate limit of 2 requests per second
    'filenameLimit': 192, # maximum characters to put in the filename, necessary to abide by filesystem limitations, and to "limit" token count for the prompt parser

    'filter': True,
    # fill this with tags you don't want making it into the filename
    # for starters, you can also add "anthro", "male", "female", as they're very common tags
    'filters': [
        "anthro",
        "fur",
        "male",
        "female",
        "genitals",
        "video games",
        "animal genitalia",
    ],
    'filtersRegex': [
        r"clothing$",
    ],

    'onlyIncludeModelArtists': True, # if True, only include the artist tag when it's in the model's taglist; if False, add all artists
    # i've noticed some artists that weren't included in the taglist, but are available in LAION's (vanilla SD)

    'reverseTags': False, # inverts sorting, prioritizing tags with little representation in the model
}

# load the model's taglist into a {tag: count} dict
with open(config['tags'], 'rb') as f:
    csv = f.read().decode('utf-8').split("\n")
    config['tags'] = {}
    for i in csv:
        if not i.strip():
            continue # skip blank lines (e.g. the trailing newline)
        k, v = i.split(',')
        config['tags'][k] = int(v)

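# for reference, tags.csv is expected to be "tag,count" per line; counts below are invented:
#
#   solo,2400000
#   duo,950000
#   smiling,470000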

cache = {}
try:
    with open(config['cache'], 'rb') as f:
        cache = json.loads(f.read().decode('utf-8'))
except Exception:
    pass # no cache yet (or it's unreadable), start fresh

def parse():
    global config, cache
    files = []
    for file in os.listdir(config['input']):
        files.append(file)
    for i in range(len(files)):
        file = files[i]
        md5 = re.match(r"^([a-f0-9]{32})", file)
        if not md5:
            continue # filename doesn't start with an e621 MD5, skip it
        md5 = md5.group(1)

        print(f"[{(100.0 * i / len(files)):3.0f}%]: {md5}")

        rateLimit = False
        if md5 not in cache:
            rateLimit = True
            with urllib.request.urlopen(urllib.request.Request(f"https://e621.net/posts.json?tags=md5:{md5}",
                headers = {
                    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
                }
            )) as r:
                j = json.loads(r.read())
                cache[md5] = j["posts"][0] if j["posts"] else {}

        json_meta = cache[md5]
        if not json_meta:
            continue # e621 has no post for this MD5

        tags = []
        artist = ""
        content = {
            "s": "safe content",
            "q": "questionable content",
            "e": "explict content", # (sic) misspelling kept from the original script
        }.get(json_meta["rating"], "")

        # flatten e621's tag categories into a list of prompt tags
        for cat in json_meta["tags"]:
            if cat == "artist":
                tag = "by " + " and ".join(json_meta["tags"]["artist"])
                if config['onlyIncludeModelArtists'] and tag not in config['tags']:
                    continue
                artist = tag
            else:
                for tag in json_meta["tags"][cat]:
                    tag = tag.replace("_", " ")
                    if tag not in config['tags']:
                        continue
                    if "/" in tag or ":" in tag:
                        continue # illegal filename character

                    if config['filter']:
                        should = False
                        if tag in config['filters']:
                            continue # was break in the original script, fixed ;)
                        for filter in config['filtersRegex']:
                            if re.search(filter, tag):
                                should = True
                                continue # was break in the original script, fixed ;)
                        if should:
                            continue

                    tags.append(tag)

        tags.sort(key=lambda x: -config['tags'][x], reverse=config['reverseTags'])
        if artist:
            tags.insert(0, artist)
        if content:
            tags.insert(0, content)

        # keep adding tags until the filename length limit would be exceeded
        jointmp = ""
        filtered = []
        for t in tags:
            if len(jointmp + " " + t) > config['filenameLimit']:
                break
            jointmp += " " + t
            filtered.append(t)
        joined = " ".join(filtered)

        # copy the file to the output folder under its new tag-laden name
        shutil.copy(os.path.join(config['input'], file), os.path.join(config['output'], file.replace(md5, joined)))

        if rateLimit and config['rateLimit']:
            time.sleep(config['rateLimit'] / 1000.0)

    # NOOOOOOOO YOU'RE WASTING SPACE BY PRETTIFYING
    with open(config['cache'], 'wb') as f:
        f.write(json.dumps(cache, indent='\t').encode('utf-8'))

if __name__ == "__main__":
    parse()
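
How it's meant to be run (my assumption, given the relative `./in/`, `./out/`, `tags.csv`, and `cache.json` paths): from inside `utils/renamer/`, with Python 3.6+ for the f-strings:
```
cd utils/renamer
python3 preprocess.py
```
Renamed copies land in `./out/`, and tag lookups get cached in `cache.json`, so re-runs only hit the e621 API for MD5s the script hasn't seen before.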