stable-diffusion-utils/utils/renamer/preprocess.js

135 lines
4.2 KiB
JavaScript
Executable File

let FS = require("fs")
let Fetch = require("node-fetch")
// Settings for the renamer. Adjust the values here before running the script.
let config = {
	// directory holding the files to process
	input: "./in/",
	// directory the renamed copies are written to
	output: "./out/",
	// CSV of tags associated with the yiffy model; swap it for another booru
	// flavour's taglist if you are training against a different model
	tags: "./tags.csv",
	// JSON file of cached tags, which speeds up re-runs
	cache: "./cache.json",
	// milliseconds to wait between requests; e621 imposes a rate limit of
	// 2 requests per second
	rateLimit: 500,
	// maximum characters to put in the filename: needed to respect filesystem
	// limitations and to "limit" the token count for the prompt parser
	filenameLimit: 192,
	// when true, tags matching an entry of `filters` are left out of the filename
	filter: true,
	// Tags you don't want in the filename. Entries are either exact tag strings
	// or regular expressions. Very common tags such as "anthro", "male" and
	// "female" are a sensible starting point.
	filters: [
		"anthro",
		"fur",
		"male",
		"female",
		"genitals",
		"video games",
		"animal genitalia",
		/clothing$/,
	],
	// true: keep the artist tag only when it appears in the model's taglist;
	// false: always keep it. Some artists are missing from the taglist yet
	// are available in LAION's (vanilla SD).
	onlyIncludeModelArtists: true,
	// inverts the sort order, prioritising tags with little representation
	// in the model
	reverseTags: false,
};
// Names of every entry in the input directory.
let files = FS.readdirSync(config.input);

// Load the tag list. Each non-blank row is read as `tag,count`; any further
// columns are ignored.
let csv = FS.readFileSync(config.tags);
csv = csv.toString().split(/\r?\n/); // accept both LF and CRLF line endings

// From here on config.tags is a tag -> count dictionary instead of a path.
// It has no prototype, so a tag named e.g. "constructor" cannot resolve to an
// inherited Object member.
config.tags = Object.create(null);
for ( const line of csv ) {
	// A trailing newline or stray blank row would otherwise add a "" key.
	if ( !line.trim() ) continue;
	const [name, count] = line.split(",");
	// Explicit radix: counts are decimal. Rows without a numeric count end up
	// as NaN, which lookups treat the same as "not in the list".
	config.tags[name] = parseInt(count, 10);
}
// Post metadata from earlier runs, keyed by md5. Falls back to an empty
// object whenever no usable cache can be loaded.
let cache;
try {
	cache = JSON.parse( FS.readFileSync(config.cache, "utf8") );
} catch ( e ) {
	// A missing file is the normal first-run case and stays silent. Anything
	// else (unreadable file, corrupt JSON) still falls back to an empty cache,
	// but is reported because the file will be overwritten at the end of the run.
	if ( e.code !== "ENOENT" ) {
		console.warn(`Ignoring unusable cache ${config.cache}: ${e.message}`);
	}
	cache = {};
}
// Valid JSON that is not a plain object (null, a number, an array, ...) cannot
// serve as an md5 lookup table, so start fresh in that case too.
if ( cache === null || typeof cache !== "object" || Array.isArray(cache) ) {
	cache = {};
}
// Phrase placed at the very front of the filename for an e621 rating code.
// Any other value yields "" and no phrase is added.
const ratingLabel = ( rating ) => {
	switch ( rating ) {
		case "s": return "safe content";
		case "q": return "questionable content";
		case "e": return "explicit content"; // this text ends up verbatim in the filename
		default: return "";
	}
};

// Count recorded for `tag` in config.tags, or undefined when it is absent.
// The own-property check keeps names such as "constructor" from resolving to
// inherited Object.prototype members when config.tags is a plain object.
const tagCount = ( tag ) => (
	Object.prototype.hasOwnProperty.call(config.tags, tag) ? config.tags[tag] : undefined
);

// True when `tag` equals a string entry of config.filters or matches a RegExp entry.
const isFilteredTag = ( tag ) => config.filters.some(
	( filter ) => filter === tag || ( filter instanceof RegExp && Boolean(tag.match(filter)) )
);

// Splits a post's tag categories into the artist credit ("" when none is kept)
// and the remaining tags that survive the model-taglist and filter checks,
// sorted by their count in config.tags.
const collectTags = ( json ) => {
	let artist = "";
	const tags = [];
	for ( const [category, names] of Object.entries(json.tags || {}) ) {
		if ( !Array.isArray(names) ) continue;
		if ( category === "artist" ) {
			// No artists listed: skip, rather than emitting a dangling "by ".
			if ( names.length === 0 ) continue;
			const credit = "by " + names.join(" and ");
			// NOTE(review): the combined credit is looked up as a single key, so a
			// multi-artist post only passes when that exact key exists -- confirm intended.
			if ( config.onlyIncludeModelArtists && !tagCount(credit) ) continue;
			artist = credit;
			continue;
		}
		for ( const raw of names ) {
			const tag = raw.replace(/_/g, " ");
			if ( !tagCount(tag) ) continue; // not in the taglist (or count is 0 / NaN)
			if ( tag.includes("/") ) continue; // illegal filename character
			if ( config.filter && isFilteredTag(tag) ) continue;
			tags.push(tag);
		}
	}
	// Highest count first, or lowest first when config.reverseTags is set.
	const polarity = config.reverseTags ? -1 : 1;
	tags.sort( (a, b) => (tagCount(b) - tagCount(a)) * polarity );
	return { artist, tags };
};

// Joins as many leading entries of `parts` as fit within config.filenameLimit.
// Each kept entry is charged its length plus one separator, so the joined
// result always stays strictly below the limit. Stops at the first entry
// that does not fit.
const fitToLimit = ( parts ) => {
	const kept = [];
	let used = 0;
	for ( const part of parts ) {
		if ( used + 1 + part.length > config.filenameLimit ) break;
		used += 1 + part.length;
		kept.push(part);
	}
	return kept.join(" ");
};

// Requests the post with the given md5 from e621. Resolves to the first post
// of the response, or undefined when the response lists none.
// Throws on a non-2xx status or an unparsable body.
const fetchPost = async ( md5 ) => {
	const response = await Fetch( `https://e621.net/posts.json?tags=md5:${md5}`, {
		headers: {
			'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
		}
	} );
	if ( !response.ok ) {
		throw new Error(`e621 request for ${md5} failed with HTTP ${response.status}`);
	}
	const body = JSON.parse(await response.text());
	return Array.isArray(body.posts) ? body.posts[0] : undefined;
};

// Resolves after `ms` milliseconds.
const pause = ( ms ) => new Promise( (resolve) => setTimeout(resolve, ms) );

// Copies every input file whose name starts with a 32-character hex md5 into
// config.output, with the md5 replaced by "<rating phrase> <artist> <tags...>".
// Metadata comes from the cache, falling back to an e621 lookup. Files that do
// not start with an md5, or whose post cannot be found, are skipped.
// The cache is written back to config.cache when the run ends, even on failure.
let parse = async () => {
	try {
		for ( const [index, file] of files.entries() ) {
			const match = file.match(/^([a-f0-9]{32})/);
			if ( !match ) continue;
			const md5 = match[1];
			console.log(`[${(100.0 * index / files.length).toFixed(3)}%]: ${md5}`);

			if ( !cache[md5] ) {
				cache[md5] = await fetchPost(md5);
				// Wait after every request, whether or not a post was found, so a
				// run of unknown md5s cannot exceed the rate limit.
				if ( config.rateLimit ) await pause(config.rateLimit);
			}
			const json = cache[md5];
			if ( !json ) continue;

			const { artist, tags } = collectTags(json);
			const content = ratingLabel(json.rating);
			// Final order: rating phrase, artist credit, then the sorted tags.
			if ( artist ) tags.unshift(artist);
			if ( content ) tags.unshift(content);
			const joined = fitToLimit(tags);

			// A replacer function keeps "$" sequences in tags from being read as
			// replacement patterns. The copy is synchronous (blocking).
			FS.copyFileSync(`${config.input}/${file}`, `${config.output}/${file.replace(md5, () => joined)}`);
		}
	} finally {
		// Saved even when the loop throws, so metadata fetched so far is not lost.
		// Pretty-printed with tabs (larger on disk than compact JSON).
		FS.writeFileSync(config.cache, JSON.stringify( cache, null, "\t" ));
	}
};
// Start the run. A failure is printed and reflected in the exit code rather
// than being left as an unhandled promise rejection.
parse().catch( (err) => {
	console.error(err);
	process.exitCode = 1;
} );