2022-10-04 21:14:46 +07:00
let FS = require ( "fs" )
let Fetch = require ( "node-fetch" )
2022-10-06 00:16:32 +07:00
// Default settings for the tagging script. Any key can be replaced by the JSON file at `source`.
let config = {
	source: "./data/config/preprocess.json", // optional JSON file whose keys replace the defaults below

	input: "./images/downloaded/", // files to process
	output: "./images/tagged/", // directory the processed files are copied into
	tags: "./data/tags.csv", // csv of tags associated with the yiffy model (replace for other flavor of booru's taglist associated with the model you're training against)
	cache: "./data/cache.json", // JSON file of cached tags, will speed up processing if re-running

	rateLimit: 500, // time to wait between requests, in milliseconds, e621 imposes a rate limit of 2 requests per second
	filenameLimit: 243, // maximum characters to put in the filename, necessary to abide by filesystem limitations, and to "limit" token count for the prompt parser; 0 disables the limit

	filter: true, // set to false to ignore the `filters` list below
	// fill it with tags of whatever you don't want to make it into the filename
	// for starters, you can also add "anthro", "male", "female", as they're very common tags
	filters: [
		// commented out because keeping these helps hypernetworks
		// "anthro",
		// "fur",
		// "male",
		// "female",
		"animal genitalia", // redundant tag, a more specific genitalia tag is usually present as well
		"genitals", // useless tag when a more specific one will always be present
		"video games", // overly broad
		/clothing$/, // all the various verbose clothing tags
		/fluids$/, // bodily fluids, genital fluids, etc.
		/ (fe)?male$/, // overweight male, overweight female
		// /^(fe)?male /, // male penetrating, female penetrating, etc.
	],

	// treat these tags (or every tag of these categories) as already being included in the model's taglist
	// if you're cautious, include the species you want, but including species has not proven necessary
	// you can also include character names / series names if you're using this for hypernetworks
	// you can also use this to boost a tag already defined to max priority
	tagsOverride: [],
	// tagsOverride: ["character", "species", "copyright"], // useful for hypernetwork training
	tagsOverrideStart: 1000000, // starting score that your overridden tags will start from, for sorting purposes

	// tags to always include in the list
	// it is HIGHLY suggested to put these tags in your training template instead
	tagsAutoInclude: [],

	removeParentheses: true, // strips trailing qualifiers like `blaidd_(elden_ring)` or `curt_(animal_crossing)` without needing to list each one above
	// good because the parentheses interfere with a lot of downstream handling

	onlyIncludeModelArtists: true, // if true, only include the artist's tag if in the model's taglist, if false, add all artists
	// some artists are missing from the taglist, but are available in LAION's (vanilla SD)

	reverseTags: false, // inverts sorting, prioritizing tags with little representation in the model

	tagDelimiter: ", ", // what separates each tag in the filename, web UI will accept comma separated filenames

	invalidCharacters: "\\/:*?\"<>|", // characters that can't go in a filename

	lora: false, // set to true to enable outputting for LoRA training
}
2022-10-14 18:04:33 +07:00
// import source
// Overlay user settings from the JSON file at `config.source`, when that file exists.
// A read or parse failure is only reported; the settings already in `config` stay in effect.
if (FS.existsSync(config.source)) {
	try {
		const overrides = JSON.parse(FS.readFileSync(config.source));
		// Copy every key of the parsed file onto the settings object.
		Object.assign(config, overrides);
		console.log(` Imported settings from " ${config.source} " `);
	} catch (err) {
		console.error(err);
	}
}
2022-10-06 00:16:32 +07:00
// Load the model's tag list: one `tag,score` pair per line of the CSV at `config.tags`.
// `config.tags` is then replaced by a lookup table mapping tag name -> numeric score.
let csv = FS.readFileSync(config.tags).toString().split("\n");
// No prototype, so a tag named e.g. "constructor" cannot match an inherited member.
config.tags = Object.create(null);
for (const line of csv) {
	if (!line.trim()) continue; // skip blank lines such as the one after a trailing newline
	const [k, v] = line.split(",");
	config.tags[k] = Number.parseInt(v, 10);
}
// Overridden tags receive descending scores starting at `tagsOverrideStart`,
// so earlier entries of `tagsOverride` end up with the higher score.
for (const entry of config.tagsOverride) {
	const override = entry.replace(/_/g, " ");
	config.tags[override] = config.tagsOverrideStart--;
}
2022-10-06 00:16:32 +07:00
// Load previously fetched post metadata (keyed by md5) so re-runs can skip the network.
let cache;
try {
	cache = JSON.parse(FS.readFileSync(config.cache));
} catch (e) {
	// A missing cache file is expected on a first run. Anything else (unreadable file,
	// corrupt JSON) is reported, because the cache file gets rewritten at the end of the run.
	if (e.code !== "ENOENT") {
		console.error(`Could not load cache from ${config.cache}, starting with an empty cache:`, e);
	}
	cache = {};
}
// Valid JSON that is not an object (e.g. `null`) cannot serve as a lookup table.
if (cache === null || typeof cache !== "object") cache = {};
2022-10-04 21:14:46 +07:00
2022-10-13 20:07:25 +07:00
// Optional positional CLI arguments: [input directory] [output directory].
// slice() copies, so process.argv itself is left untouched.
let args = process.argv.slice(2);
if (args[0]) config.input = args[0];
if (args[1]) config.output = args[1];

// LoRA mode adjusts two settings before processing starts.
if (config.lora) {
	config.filenameLimit = 0; // 0 switches off the length cut-off applied when tags are joined
	if (config.tagDelimiter.length === 1) {
		config.tagDelimiter += " "; // pad a single-character delimiter, e.g. "," becomes ", "
	}
}
2022-10-13 20:07:25 +07:00
for ( let k in { "input" : null , "output" : null } ) {
try {
if ( ! FS . lstatSync ( config [ k ] ) . isDirectory ( ) ) {
console . error ( ` specified path for ${ k } is not a directory: ${ config [ k ] } ` )
return ;
}
} catch ( e ) {
console . error ( ` specified path for ${ k } is not found: ${ config [ k ] } ` )
return ;
}
}
2022-10-13 21:29:19 +07:00
// Snapshot of the input directory listing; `parse` walks this list.
let files = FS.readdirSync(config.input);
console.log(` Parsing ${files.length} files from ${config.input} => ${config.output} `);
2022-10-04 21:25:47 +07:00
// Resolves after `ms` milliseconds; used to space out API requests.
const sleep = (ms) => new Promise((resolve) => {
	setTimeout(resolve, ms);
});

// Pulls the 32-hex-digit md5 out of a filename: either at the very start, or directly
// before a .jpg/.jpeg/.png extension. Returns null when the name carries no md5.
const extractMd5 = (file) => {
	const match = file.match(/^([a-f0-9]{32})/) || file.match(/([a-f0-9]{32})\.(jpe?g|png)$/);
	return match ? match[1] : null;
};

// Looks up one post by md5 on e621. Resolves to the post object, or undefined when the
// response lists no post; rejects on network errors and non-2xx responses.
const fetchPost = async (md5) => {
	const r = await Fetch(`https://e621.net/posts.json?tags=md5:${md5}`, {
		headers: {
			'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
		}
	});
	if (!r.ok) throw new Error(`e621 responded with HTTP ${r.status} for md5 ${md5}`);
	const j = JSON.parse(await r.text());
	return j && Array.isArray(j.posts) ? j.posts[0] : undefined;
};

// True when a tag must not reach the output: it contains a character that is invalid in
// filenames, or (with `config.filter` on) it matches one of `config.filters`.
const isExcludedTag = (tag) => {
	for (const ch of config.invalidCharacters) {
		if (tag.includes(ch)) return true;
	}
	if (!config.filter) return false;
	return config.filters.some((filter) => filter === tag || (filter instanceof RegExp && tag.match(filter) !== null));
};

// Walks every tag category of a post and returns the accepted tags plus the artist tag
// ("" when there is none or it was rejected). May add scores to `config.tags` for
// overridden tags that have none yet.
const collectTags = (json) => {
	const tags = [...config.tagsAutoInclude];
	let artist = "";
	for (const [cat, names] of Object.entries(json.tags)) {
		if (cat === "artist") {
			if (names.length === 0) continue; // no artist listed: do not emit a bare "by "
			const tag = `by ${names.join(" and ")}`;
			if (config.onlyIncludeModelArtists && !config.tags[tag]) continue;
			artist = tag;
			continue;
		}
		const categoryOverride = config.tagsOverride.includes(cat);
		for (const name of names) {
			const tag = name.replace(/_/g, " ");
			// Decided per tag: a tag-level match must not leak onto the tags that follow it.
			const override = categoryOverride || config.tagsOverride.includes(tag);
			if (override) {
				if (!config.tags[tag]) {
					// Category overrides are scaled so that categories listed earlier outrank later ones.
					const idx = config.tagsOverride.indexOf(cat);
					const scale = idx >= 0 ? Math.pow(10, config.tagsOverride.length - idx + 1) : 1;
					config.tags[tag] = (config.tagsOverrideStart--) * scale;
				}
			} else if (!config.tags[tag]) continue; // unknown to the model's taglist
			if (isExcludedTag(tag)) continue;
			tags.push(tag);
		}
	}
	return { tags, artist };
};

// Joins tags with `config.tagDelimiter`, stopping before `config.filenameLimit` would be
// exceeded (a limit of 0 means unlimited). Trailing "(...)" qualifiers are stripped when
// `config.removeParentheses` is set; the length check uses the unstripped tag.
const buildCaption = (tags) => {
	let jointmp = "";
	const kept = [];
	for (const tag of tags) {
		if (config.filenameLimit && (jointmp + config.tagDelimiter + tag).length > config.filenameLimit) break;
		jointmp += config.tagDelimiter + tag;
		kept.push(config.removeParentheses ? tag.replace(/\(.+?\)$/, "").trim() : tag);
	}
	return kept.join(config.tagDelimiter);
};

/**
 * Processes every md5-named file in `files`: resolves its e621 post (from `cache`, or over
 * the network), builds the tag caption, copies the file into `config.output` and writes the
 * caption to `<index>.txt` there. The cache file is rewritten when the loop ends, including
 * when it ends with an error.
 *
 * @returns {Promise<void>}
 */
let parse = async () => {
	try {
		for (const [i, file] of files.entries()) {
			const md5 = extractMd5(file);
			if (!md5) continue;

			console.log(` [ ${(100.0 * i / files.length).toFixed(3)} %]: ${md5} `);

			if (!cache[md5]) {
				try {
					cache[md5] = await fetchPost(md5);
				} catch (e) {
					// One failed lookup should not abort the whole batch; a later run retries it.
					console.error(`Failed to fetch post for ${md5}, skipping:`, e);
				}
				// Wait after every request, including ones that found nothing, to honour the rate limit.
				if (config.rateLimit) await sleep(config.rateLimit);
			}
			const json = cache[md5];
			if (!json) continue;

			let content = "";
			switch (json.rating) {
				case "s": content = "safe content"; break;
				case "q": content = "questionable content"; break;
				// NOTE(review): "explict" is kept as-is because it may be what the target model expects; confirm before correcting.
				case "e": content = "explict content"; break;
			}

			const collected = collectTags(json);
			const polarity = config.reverseTags ? -1 : 1;
			// NOTE(review): tags without a score (e.g. from tagsAutoInclude) make this comparator yield NaN.
			let tags = collected.tags.sort((a, b) => (config.tags[b] - config.tags[a]) * polarity);
			if (collected.artist) tags.unshift(collected.artist);
			if (content) tags.unshift(content);

			const joined = buildCaption(tags);

			// LoRA output keeps short index-based filenames; otherwise the caption becomes the filename.
			// A replacer function keeps "$" sequences inside tags literal.
			const replacement = config.lora ? String(i) : joined;
			FS.copyFileSync(`${config.input}/${file}`, `${config.output}/${file.replace(md5, () => replacement).trim()}`);
			FS.writeFileSync(`${config.output}/${i}.txt`, joined);
		}
	} finally {
		// Pretty-printed with tabs; written even after a failure so fetched posts are not lost.
		FS.writeFileSync(config.cache, JSON.stringify(cache, null, "\t"));
	}
}
2022-10-04 21:14:46 +07:00
2022-10-04 21:25:47 +07:00
// Start processing; report any failure and signal it through the exit status
// instead of leaving the promise rejection unhandled.
parse().catch((e) => {
	console.error(e);
	process.exitCode = 1;
});