# 2022-10-08 00:16:29 +00:00
# Credits to https://gist.github.com/nnuudev/56ed3242023c8582a32e3130ef59730b / https://boards.4chan.org/trash/thread/51463059#p51472156
import os
import re
import json
import time
import shutil
import urllib . request
# Settings for a run; edit the values below to match your setup.
config = {
    # directory holding the files to process
    'input': './in/',
    # directory the renamed copies are written to
    'output': './out/',
    # csv of tags associated with the yiffy model (replace it with the taglist of
    # whichever flavor of booru model you are training against)
    'tags': './tags.csv',
    # JSON file of cached tags; speeds up processing when re-running
    'cache': './cache.json',

    # time to wait between requests, in milliseconds
    # (e621 imposes a rate limit of 2 requests per second)
    'rateLimit': 500,
    # maximum characters to put in the filename, necessary to abide by filesystem
    # limitations; 250 may also work now that the web UI has uncapped the prompt
    # limit, but it is untested whether that also holds for textual inversion
    'filenameLimit': 192,

    # when True, tags matched by 'filters' or 'filtersRegex' are kept out of the filename
    'filter': True,
    # exact tags you do not want in the filename; very common tags such as
    # "anthro", "male" and "female" are a good starting point
    'filters': [
        "anthro",
        "fur",
        "male",
        "female",
        "genitals",
        "video games",
        "animal genitalia",
    ],
    # regular expressions searched within each tag; a hit drops the tag
    'filtersRegex': [
        r"clothing$",
    ],

    # if True, only include the artist's tag when it is in the model's taglist;
    # if False, add all artists (some artists missing from the taglist are still
    # available in LAION's, i.e. vanilla SD)
    'onlyIncludeModelArtists': True,
    # inverts sorting, prioritizing tags with little representation in the model
    'reverseTags': False,
}
# Swap the taglist *path* stored in config['tags'] for the parsed taglist itself:
# a dict mapping each tag to the integer found after it on its "tag,number" row
# (presumably how often the tag appears in the model -- verify against the csv).
with open(config['tags'], 'rb') as f:
    tag_rows = f.read().decode('utf-8').split("\n")
config['tags'] = {}
for row in tag_rows:
    # A newline-terminated file yields an empty last row; skip blank rows
    # instead of crashing on the unpacking below.
    if not row.strip():
        continue
    # Split on the last comma only, so a tag that itself contains a comma still parses.
    name, count = row.rsplit(',', 1)
    config['tags'][name] = int(count)
# Previously fetched post metadata, keyed by md5. Loading it is best-effort:
# a missing, unreadable or corrupt cache file simply means starting empty.
cache = {}
try:
    with open(config['cache'], 'rb') as f:
        cache = json.loads(f.read().decode('utf-8'))
except (OSError, ValueError):
    # OSError covers a missing/unreadable file; ValueError covers both invalid
    # UTF-8 and invalid JSON. Anything else (e.g. Ctrl-C) is no longer swallowed.
    pass
# Filenames that start with the md5, like "83737b5e961b594c26e8feaed301e7a5 (1).jpg"
# (duplicated copies from a file manager).
_MD5_PREFIX = re.compile(r"^([a-f0-9]{32})")
# Filenames that end with the md5 right before the extension, like
# "00001-83737b5e961b594c26e8feaed301e7a5.jpg" (output from voldy's web UI preprocessing).
_MD5_SUFFIX = re.compile(r"([a-f0-9]{32})\.(jpe?g|png)$")


def _find_md5(filename):
    """Return the 32-character hex md5 embedded in *filename*, or None if there is none."""
    # The suffix pattern has to be *searched*: match() only anchors at the start of
    # the string, so it could never reach an md5 that follows a "00001-" style prefix.
    found = _MD5_PREFIX.match(filename) or _MD5_SUFFIX.search(filename)
    return found.group(1) if found else None


def _fetch_post(md5):
    """Query e621 for the post with this md5; return its JSON dict, or {} when nothing matches."""
    request = urllib.request.Request(
        f"https://e621.net/posts.json?tags=md5:{md5}",
        headers={
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
        },
    )
    with urllib.request.urlopen(request) as response:
        posts = json.loads(response.read())["posts"]
    return posts[0] if posts else {}


def _caption_tags(post):
    """Return the ordered tags for one post: rating label, artist credit, then the model's tags."""
    # NOTE(review): "explict" looks like a typo for "explicit", but the string ends up
    # in the output filename, so it is left untouched -- confirm which one is wanted.
    rating = {
        "s": "safe content",
        "q": "questionable content",
        "e": "explict content",
    }.get(post["rating"], "")

    artist = ""
    tags = []
    for category, names in post["tags"].items():
        if category == "artist":
            credit = "by " + " and ".join(names)
            if config['onlyIncludeModelArtists'] and credit not in config['tags']:
                continue
            artist = credit
            continue
        for name in names:
            tag = name.replace("_", " ")
            if tag not in config['tags']:
                continue
            if "/" in tag or ":" in tag:
                continue  # illegal filename character
            if config['filter']:
                if tag in config['filters']:
                    continue
                if any(re.search(pattern, tag) for pattern in config['filtersRegex']):
                    continue
            tags.append(tag)

    # Largest taglist value first, unless config['reverseTags'] flips the order.
    tags.sort(key=lambda x: -config['tags'][x], reverse=config['reverseTags'])
    if artist:
        tags.insert(0, artist)
    if rating:
        tags.insert(0, rating)
    return tags


def _fit_to_limit(tags, limit):
    """Return the leading tags that fit in *limit* characters, stopping at the first that does not."""
    kept = []
    used = 0
    for tag in tags:
        # Every tag is charged one separating space, the first one included, which
        # keeps the cut-off point identical to how the length was always measured.
        needed = used + 1 + len(tag)
        if needed > limit:
            break
        used = needed
        kept.append(tag)
    return kept


def parse():
    """Copy every recognised input file to the output directory, renamed after its tags.

    Each file in config['input'] whose name contains an md5 is looked up in the
    module-level cache, falling back to an e621 request on a miss. The md5 in the
    filename is then replaced by the space-joined tags (capped by
    config['filenameLimit']) and the file is copied into config['output'].
    Files without an md5 in their name, or without a matching post, are skipped.

    The cache is written back to config['cache'] when the run ends, including
    when it ends early because a request or a copy raised.
    """
    files = os.listdir(config['input'])
    try:
        for index, file in enumerate(files):
            md5 = _find_md5(file)
            if not md5:
                continue

            print(f"[{(100.0 * index / len(files)):3.0f}%]: {md5}")

            if md5 not in cache:
                cache[md5] = _fetch_post(md5)
                # Throttle right after the request so that posts that were not found
                # (and are skipped below) still count against the rate limit.
                if config['rateLimit']:
                    time.sleep(config['rateLimit'] / 1000.0)

            post = cache[md5]
            if not post:
                continue

            joined = " ".join(_fit_to_limit(_caption_tags(post), config['filenameLimit']))
            shutil.copy(
                os.path.join(config['input'], file),
                os.path.join(config['output'], file.replace(md5, joined)),
            )
    finally:
        # Persist whatever was fetched, even on failure, so a re-run does not have
        # to repeat rate-limited requests. Pretty-printed on purpose.
        with open(config['cache'], 'wb') as f:
            f.write(json.dumps(cache, indent='\t').encode('utf-8'))
# Run the batch only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    parse()