// Pre-processes downloaded e621 images for Stable Diffusion training:
// looks up each file's tags by MD5 via the e621 API (cached in a JSON file),
// scores and filters them against the model's taglist, then copies the image
// into the output folder with the sorted tag list baked into the filename
// (or, for LoRA training, with tags in a numbered sidecar .txt instead).

let FS = require("fs")
let Fetch = require("node-fetch")

let config = {
	source: `./data/config/preprocess.json`, // optional JSON file of overrides for the defaults below
	input: `./images/downloaded/`, // files to process
	output: `./images/tagged/`, // files to copy files to
	tags: `./data/tags.csv`, // csv of tags associated with the yiffy model (replace for other flavor of booru's taglist associated with the model you're training against)
	cache: `./data/cache.json`, // JSON file of cached tags, will speed up processing if re-running
	rateLimit: 500, // time to wait between requests, in milliseconds, e621 imposes a rate limit of 2 requests per second
	filenameLimit: 243, // maximum characters to put in the filename, necessary to abide by filesystem limitations, and to "limit" token count for the prompt parser

	filter: true,
	// fill it with tags of whatever you don't want to make it into the filename
	// for starters, you can also add "anthro", "male", "female", as they're very common tags
	filters: [
		// commented because it'll help hypernetworks
		// "anthro",
		// "fur",
		// "male",
		// "female",
		"animal genitalia", // redundant tag, usually anything will have the nasty dog dick tag or horse cock tag
		"genitals", // useless tag when everything will have penis or vagina
		"video games", // you hear about VIDEOGAMES
		/clothing$/, // all the various verbose clothing tags
		/fluids$/, // bodily fluids, genital fluids, etc.
		/ (fe)?male$/, // overweight male, overweight female
		// /^(fe)?male /, // male penetrating, female penetrating, etc.
	],

	// treat these tags as already being included in the
	// if you're cautious (paranoid), include species you want, but I found I don't really even need to include specis
	// you can also include character names / series names if you're using this for hypernetworks
	// you can also use this to boost a tag already defined to max priority
	tagsOverride: [],
	// tagsOverride: ["character", "species", "copyright"], // useful for hypernetwork training
	tagsOverrideStart: 1000000, // starting score that your overriden tags will start from, for sorting purposes

	// tags to always include in the list
	// I HIGHLY suggest including these tags in your training template instead
	tagsAutoInclude: [],

	removeParentheses: true, // removes shit like `blaidd_(elden_ring)` or `curt_(animal_crossing)` without needing to specify it all in the above
	// good because it messes with a lot of shit

	onlyIncludeModelArtists: true, // if true, only include the artist's tag if in the model's taglist, if false, add all artists
	// i've noticed some artists that weren't included in the taglist, but is available in LAION's (vanilla SD)

	reverseTags: false, // inverts sorting, prioritizing tags with little representation in the model
	tagDelimiter: ", ", // what separates each tag in the filename, web UI will accept comma separated filenames
	invalidCharacters: "\\/:*?\"<>|", // characters that can't go in a filename
	lora: false, // set to true to enable outputting for LoRA training
}

// import source — keys from the external config file win over the defaults above
if ( FS.existsSync(config.source) ) try {
	let imp = JSON.parse( FS.readFileSync(config.source) )
	for ( let k in imp ) {
		config[k] = imp[k]
	}
	console.log(`Imported settings from "${config.source}"`)
} catch ( e ) {
	console.error(e)
}

// load the model's taglist, converting config.tags from a path into a
// { tag => score } lookup table
// fix: split on /\r?\n/ so a CRLF taglist doesn't leave a stray \r glued to
// every score; fix: parseInt with an explicit radix
let csv = FS.readFileSync(config.tags)
csv = csv.toString().split(/\r?\n/)
config.tags = {}
for ( let i in csv ) {
	let [k, v] = csv[i].split(",")
	config.tags[k] = parseInt(v, 10);
}
// boost overridden tags to the top of the sort order, earlier entries highest
for ( let i in config.tagsOverride ) {
	let override = config.tagsOverride[i].replace(/_/g, " ");
	config.tags[override] = config.tagsOverrideStart--;
}

let cache;
try {
	cache = JSON.parse( FS.readFileSync(config.cache) )
} catch ( e ) {
	cache = {}; // missing/corrupt cache just means every post gets fetched fresh
}

// optional CLI overrides: node <script> [input] [output]
let args = process.argv;
args.shift(); // node binary
args.shift(); // script path
if ( args[0] ) config.input = args[0];
if ( args[1] ) config.output = args[1];

if ( config.lora ) {
	config.filenameLimit = 0; // LoRA tags go in a sidecar .txt, so never truncate
	if ( config.tagDelimiter.length == 1 ) {
		config.tagDelimiter += " "; // ensure "tag, tag" rather than "tag,tag"
	}
}

// bail out early if either path is missing or not actually a directory
for ( let k in {"input":null, "output":null} ) {
	try {
		if ( !FS.lstatSync(config[k]).isDirectory() ) {
			console.error(`specified path for ${k} is not a directory: ${config[k]}`)
			process.exit(1);
		}
	} catch ( e ) {
		console.error(`specified path for ${k} is not found: ${config[k]}`)
		process.exit(1);
	}
}

let files = FS.readdirSync(config.input);
console.log(`Parsing ${files.length} files from ${config.input} => ${config.output}`)

let parse = async () => {
	for ( let i in files ) {
		let file = files[i];
		// e621 downloads are named by the post's MD5; accept it either at the
		// start of the filename or immediately before the extension
		let md5 = file.match(/^([a-f0-9]{32})/);
		if ( !md5 ) {
			md5 = file.match(/([a-f0-9]{32})\.(jpe?g|png)$/);
			if ( !md5 ) continue;
		}
		md5 = md5[1];
		console.log(`[${(100.0 * i / files.length).toFixed(3)}%]: ${md5}`);

		// only rate-limit iterations that actually hit the API; cache hits are free
		let rateLimit = false;
		if ( !cache[md5] ) {
			rateLimit = true;
			let r = await Fetch( `https://e621.net/posts.json?tags=md5:${md5}`, {
				headers: {
					'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
				}
			} );
			let j = JSON.parse(await r.text());
			// fix: guard against error payloads with no `posts` array, which
			// previously threw a TypeError and killed the whole run
			cache[md5] = (j.posts || [])[0];
		}
		let json = cache[md5];
		if ( !json ) continue; // e621 doesn't know this MD5 — skip the file

		let tags = [...config.tagsAutoInclude]; // fix: was an implicit global
		let artist = "";
		let content = "";
		switch ( json.rating ) {
			case "s": content = "safe content"; break;
			case "q": content = "questionable content"; break;
			case "e": content = "explicit content"; break; // fix: was misspelled "explict"
		}

		for ( let cat in json.tags ) {
			let override = config.tagsOverride.includes(cat);
			if ( cat === "artist" ) {
				// artists collapse into a single "by A and B" tag, prepended later
				let tag = `by ${json.tags["artist"].join(" and ")}`
				if ( config.onlyIncludeModelArtists && !config.tags[tag] ) continue;
				artist = tag;
			} else for ( let k in json.tags[cat] ) {
				let tag = json.tags[cat][k].replace(/_/g, " ");
				if ( !override ) override = config.tagsOverride.includes(tag)
				if ( override ) {
					// score unknown overridden tags on the fly, weighting whole
					// overridden categories higher the earlier they're listed
					if ( !config.tags[tag] ) {
						let idx = config.tagsOverride.indexOf( cat );
						let scale = idx >= 0 ? Math.pow( 10, config.tagsOverride.length - idx + 1 ) : 1;
						config.tags[tag] = (config.tagsOverrideStart--) * scale;
					}
				} else if ( !config.tags[tag] ) continue; // not in the model's taglist

				// drop tags containing characters that can't appear in a filename
				let filtered = false;
				for ( let i in config.invalidCharacters ) {
					if ( tag.indexOf(config.invalidCharacters[i]) >= 0 ) {
						filtered = true;
						break;
					}
				}
				// drop tags matching the blacklist (exact strings or regexes)
				if ( config.filter ) {
					for ( let i in config.filters ) {
						let filter = config.filters[i];
						if ( filter === tag || ( filter instanceof RegExp && tag.match(filter) ) ) {
							filtered = true;
							break;
						}
					}
					if ( filtered ) continue;
				}
				if ( !filtered ) tags.push(tag);
			}
		}

		// highest-scoring tags first (inverted when reverseTags is set);
		// fix: tags absent from the taglist (e.g. tagsAutoInclude entries) now
		// score 0 instead of producing NaN, which made the sort order unstable
		tags = tags.sort( (a, b) => {
			let polarity = config.reverseTags ? -1 : 1;
			return ((config.tags[b] || 0) - (config.tags[a] || 0)) * polarity;
		})
		if ( artist ) tags.unshift(artist);
		if ( content ) tags.unshift(content);

		// join tags until the filename budget runs out (filenameLimit 0 = unlimited);
		// the budget check uses the ORIGINAL tag, parentheses are stripped after
		let jointmp = "";
		let filtered = [];
		for ( let i in tags ) {
			if ( config.filenameLimit && (jointmp + config.tagDelimiter + tags[i]).length > config.filenameLimit ) break;
			jointmp += config.tagDelimiter + tags[i];
			if ( config.removeParentheses ) tags[i] = tags[i].replace(/\(.+?\)$/, "").trim()
			filtered.push(tags[i])
		}
		let joined = filtered.join(config.tagDelimiter)

		if ( config.lora ) {
			// LoRA: short numeric filename, tags live in the sidecar .txt below
			FS.copyFileSync(`${config.input}/${file}`, `${config.output}/${file.replace(md5, i).trim()}`)
		} else {
			// otherwise the tag list becomes the filename itself
			FS.copyFileSync(`${config.input}/${file}`, `${config.output}/${file.replace(md5, joined).trim()}`)
		}
		FS.writeFileSync(`${config.output}/${i}.txt`, joined)

		if ( rateLimit && config.rateLimit ) await new Promise( (resolve) => {
			setTimeout(resolve, config.rateLimit)
		} )
	}
	// NOOOOOOOO YOU'RE WASTING SPACE BY PRETTIFYING
	FS.writeFileSync(config.cache, JSON.stringify( cache, null, "\t" ))
}
parse();