# Credits to https://gist.github.com/nnuudev/56ed3242023c8582a32e3130ef59730b / https://boards.4chan.org/trash/thread/51463059#p51472156
import json
import math
import os
import re
import shutil
import time
import urllib.request
# Script-wide settings; edit these before running.
config = {
    'input': './in/',  # directory of files to process
    'output': './out/',  # directory the renamed copies are written to
    # csv of tags associated with the yiffy model (replace with another booru's
    # taglist for the model you're training against); this path is swapped for
    # a {tag: score} dict once the file has been loaded
    'tags': './tags.csv',
    'cache': './cache.json',  # JSON file of cached lookups, speeds up re-runs

    # time to wait between requests, in milliseconds;
    # e621 imposes a rate limit of 2 requests per second
    'rateLimit': 500,

    # maximum characters to put in the filename, necessary to abide by
    # filesystem limitations. You can set this to 245, as the web UI has
    # uncapped the prompt limit, but it is untested whether that limit was
    # also lifted for textual inversion.
    'filenameLimit': 245,

    'filter': True,  # master switch for 'filters' and 'filtersRegex'
    # fill it with tags of whatever you don't want to make it into the filename
    # for starters, you can also add "anthro", "male", "female", as they're
    # very common tags
    'filters': [
        # "anthro",
        # "fur",
        # "male",
        # "female",
        "animal genitalia",
        "genitals",
        "video games",
    ],
    # tags matching any of these patterns (re.search) are dropped as well
    'filtersRegex': [
        r"clothing$",
        r"fluids$",
        r"(fe)?male$",
    ],

    # treat these tags (or every tag in these e621 tag categories) as already
    # being included in the model's taglist.
    # if you're cautious, include the species you want, though it usually is
    # not necessary to include species.
    # you can also include character names / series names if you're using this
    # for hypernetworks.
    'tagsOverride': ["character", "copyright"],  # useful for textual inversion training
    # 'tagsOverride': ["character", "species", "copyright"],  # useful for hypernetwork training
    # starting score that your overridden tags will start from, for sorting purposes
    'tagsOverrideStart': 1000000,

    # tags to always include in the list
    # it is HIGHLY suggested to put these tags in your training template instead
    'tagsAutoInclude': [],

    # strips a trailing parenthesised qualifier such as `blaidd_(elden_ring)` or
    # `curt_(animal_crossing)` without needing to list every one of them above;
    # worthwhile because parentheses interfere with prompts
    'removeParentheses': True,

    # if True, only include the artist's tag if it is in the model's taglist;
    # if False, add all artists. Some artists are missing from the taglist but
    # are available in LAION's (vanilla SD).
    'onlyIncludeModelArtists': True,

    # inverts sorting, prioritizing tags with little representation in the model
    'reverseTags': False,

    # what separates each tag in the filename; the web UI will accept comma
    # separated filenames, but will insert it without commas
    # NOTE(review): whitespace around this literal was garbled in the copy this
    # was recovered from -- confirm whether "," or ", " is intended
    'tagDelimiter': ", ",
}
# Replace the taglist path in config['tags'] with a {tag: score} mapping read
# from that csv file (one "tag,score" pair per line).
tag_scores = {}
with open(config['tags'], 'rb') as f:
    for line in f.read().decode('utf-8').splitlines():
        if not line.strip():
            continue  # tolerate blank lines, e.g. the one after a trailing newline
        # split on the LAST comma so a tag that itself contains a comma survives;
        # a line without any comma still raises ValueError, as before
        name, score = line.rsplit(',', 1)
        tag_scores[name] = int(score)
config['tags'] = tag_scores

# Register every override under its space-separated spelling, handing out
# descending scores so earlier entries sort ahead of later ones.
for override in config['tagsOverride']:
    config['tags'][override.replace("_", " ")] = config['tagsOverrideStart']
    config['tagsOverrideStart'] = config['tagsOverrideStart'] - 1
# Post metadata from earlier runs, keyed by md5; starts empty when no usable
# cache file exists.
cache = {}
try:
    with open(config['cache'], 'rb') as f:
        cache = json.loads(f.read().decode('utf-8'))
except FileNotFoundError:
    pass  # first run: nothing has been cached yet
except (OSError, ValueError) as err:
    # unreadable or corrupt cache (bad JSON / bad UTF-8 are ValueError
    # subclasses): carry on without it, but say so instead of hiding it
    print(f"Ignoring unusable cache file {config['cache']}: {err}")
def _find_md5(filename):
    """Return the 32-hex-digit md5 embedded in *filename*, or None if absent."""
    # names like "83737b5e961b594c26e8feaed301e7a5 (1).jpg"
    # (duplicated copies from a file manager)
    found = re.match(r"^([a-f0-9]{32})", filename)
    if not found:
        # names like "00001-83737b5e961b594c26e8feaed301e7a5.jpg" (output from
        # the web UI preprocessing); this has to be search(), because match()
        # only ever tries the start of the string
        found = re.search(r"([a-f0-9]{32})\.(jpe?g|png)$", filename)
    return found.group(1) if found else None


def _fetch_post(md5):
    """Ask e621 for the post whose file hash is *md5*; return {} if none exists."""
    request = urllib.request.Request(
        f"https://e621.net/posts.json?tags=md5:{md5}",
        headers={
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
        },
    )
    with urllib.request.urlopen(request) as response:
        posts = json.loads(response.read())["posts"]
    return posts[0] if posts else {}


def _is_filtered(tag):
    """Return True when the configured filters exclude *tag* from filenames."""
    if not config['filter']:
        return False
    if tag in config['filters']:
        return True
    return any(re.search(pattern, tag) for pattern in config['filtersRegex'])


def _collect_tags(post):
    """Return ``(tags, artist)`` for one cached post.

    ``tags`` is the unsorted list of usable tags (auto-included tags first),
    ``artist`` is the combined "by ..." tag or "" when it is not wanted.
    Override tags that the taglist does not know yet are registered in
    config['tags'] as a side effect, consuming config['tagsOverrideStart'].
    """
    scores = config['tags']
    overrides = config['tagsOverride']
    tags = config['tagsAutoInclude'].copy()
    artist = ""
    for cat, members in post["tags"].items():
        if cat == "artist":
            if not members:
                continue  # no artist listed: do not emit a dangling "by "
            credit = "by " + " and ".join(members)
            if config['onlyIncludeModelArtists'] and credit not in scores:
                continue
            artist = credit
            continue
        cat_overridden = cat in overrides
        for raw in members:
            tag = raw.replace("_", " ")
            # decided per tag: one tag matching an override by name must not
            # turn every later tag of the same category into an override
            if cat_overridden or tag in overrides:
                if tag not in scores:
                    if cat_overridden:
                        # categories listed earlier get a larger multiplier
                        scale = 10 ** (len(overrides) - overrides.index(cat) + 1)
                    else:
                        scale = 1
                    scores[tag] = config['tagsOverrideStart'] * scale
                    config['tagsOverrideStart'] = config['tagsOverrideStart'] - 1
            elif tag not in scores:
                continue  # the model does not know this tag
            if "/" in tag or ":" in tag:
                continue  # illegal filename character
            if _is_filtered(tag):
                continue
            tags.append(tag)
    return tags, artist


def _build_caption(tags):
    """Join as many leading *tags* as fit within config['filenameLimit']."""
    delimiter = config['tagDelimiter']
    used = 0
    kept = []
    for tag in tags:
        # the budget is charged with the tag as found, before any
        # parenthesised suffix is removed
        needed = used + len(delimiter) + len(tag)
        if needed > config['filenameLimit']:
            break
        used = needed
        if config['removeParentheses']:
            tag = re.sub(r"\(.+?\)$", "", tag).strip()
        kept.append(tag)
    return delimiter.join(kept)


def parse():
    """Copy every md5-named file from the input directory to the output
    directory, replacing the md5 in its name with the post's tags.

    Post metadata is taken from ``cache`` when present and fetched from e621
    otherwise; files without an md5 in their name, or without a matching
    post, are skipped. Reads the module-level ``config`` and ``cache``,
    updates ``cache`` and writes it to config['cache'] on the way out, even
    when the run is aborted, so finished lookups are never repeated.
    """
    os.makedirs(config['output'], exist_ok=True)
    files = os.listdir(config['input'])
    try:
        for position, file in enumerate(files):
            md5 = _find_md5(file)
            if md5 is None:
                continue
            print(f"[{(100.0 * position / len(files)):3.0f}%]: {md5}")

            if md5 not in cache:
                cache[md5] = _fetch_post(md5)
                # wait after every request, including ones that found nothing,
                # so a run of unknown files cannot exceed the rate limit
                if config['rateLimit']:
                    time.sleep(config['rateLimit'] / 1000.0)
            post = cache[md5]
            if not post:
                continue

            # NOTE(review): "explict" is spelled as found; it may be
            # deliberate to match the model's vocabulary -- confirm before changing
            content = {
                "s": "safe content",
                "q": "questionable content",
                "e": "explict content",
            }.get(post["rating"], "")

            tags, artist = _collect_tags(post)
            # best represented first (or last with reverseTags); auto-included
            # tags missing from the taglist count as score 0 instead of raising
            tags.sort(key=lambda name: -config['tags'].get(name, 0), reverse=config['reverseTags'])
            if artist:
                tags.insert(0, artist)
            if content:
                tags.insert(0, content)

            joined = _build_caption(tags)
            shutil.copy(
                os.path.join(config['input'], file),
                os.path.join(config['output'], file.replace(md5, " " + joined).strip()),
            )
    finally:
        # tab-indented on purpose: readability over file size
        with open(config['cache'], 'wb') as f:
            f.write(json.dumps(cache, indent='\t').encode('utf-8'))
# Run the renaming pass only when executed as a script, not when imported.
if __name__ == "__main__":
    parse()