// stable-diffusion-utils/src/preprocess.js
// (233 lines, 7.8 KiB, JavaScript)
// Preprocessor for Stable Diffusion training images: looks up each
// downloaded e621 image by the MD5 hash in its filename and writes a copy
// renamed to a prioritized, delimiter-separated tag list (plus a .txt caption).
let FS = require("fs")
let Fetch = require("node-fetch")
// Default settings; every key below can be overridden by the JSON file at
// `source` (loaded further down in this script).
let config = {
source: `./data/config/preprocess.json`,
input: `./images/downloaded/`, // directory of files to process
output: `./images/tagged/`, // directory the tagged copies are written to
tags: `./data/tags.csv`, // csv of tags associated with the yiffy model (replace for other flavor of booru's taglist associated with the model you're training against)
cache: `./data/cache.json`, // JSON file of cached tags, will speed up processing if re-running
rateLimit: 500, // time to wait between requests, in milliseconds, e621 imposes a rate limit of 2 requests per second
filenameLimit: 243, // maximum characters to put in the filename, necessary to abide by filesystem limitations, and to "limit" token count for the prompt parser
filter: true, // when true, drop any tag that matches an entry in `filters` below
// fill `filters` with tags (strings or regexes) you don't want to make it into the filename
// for starters, you can also add "anthro", "male", "female", as they're very common tags
filters: [
// commented out because keeping these tags helps hypernetwork training
// "anthro",
// "fur",
// "male",
// "female",
"animal genitalia", // redundant tag; posts will carry the more specific variant anyway
"genitals", // useless tag when everything will have penis or vagina
"video games", // you hear about VIDEOGAMES
/clothing$/, // all the various verbose clothing tags
/fluids$/, // bodily fluids, genital fluids, etc.
/ (fe)?male$/, // overweight male, overweight female
// /^(fe)?male /, // male penetrating, female penetrating, etc.
],
// treat these tags as top priority: they are scored counting down from
// tagsOverrideStart, so they always sort to the front of the filename
// if you're cautious (paranoid), include species you want, but I found I don't really even need to include species
// you can also include character names / series names if you're using this for hypernetworks
// you can also use this to boost a tag already defined to max priority
tagsOverride: [],
// tagsOverride: ["character", "species", "copyright"], // useful for hypernetwork training
tagsOverrideStart: 1000000, // starting score that your overridden tags will count down from, for sorting purposes
// tags to always include in the list
// I HIGHLY suggest including these tags in your training template instead
tagsAutoInclude: [],
removeParentheses: true, // strips trailing qualifiers, e.g. `blaidd_(elden_ring)` or `curt_(animal_crossing)`, without needing to list each one in `filters`
// good because those qualifiers interfere with a lot of prompts
onlyIncludeModelArtists: true, // if true, only include the artist's tag if in the model's taglist, if false, add all artists
// i've noticed some artists that weren't included in the taglist, but are available in LAION's (vanilla SD)
reverseTags: false, // inverts sorting, prioritizing tags with little representation in the model
tagDelimiter: ", ", // what separates each tag in the filename, web UI will accept comma separated filenames
invalidCharacters: "\\/:*?\"<>|", // characters that can't go in a filename
lora: false, // set to true to enable outputting for LoRA training
}
// Merge overrides from the JSON settings file (if present) on top of the
// defaults above; a malformed file is reported but not fatal.
if ( FS.existsSync(config.source) ) try {
  const imp = JSON.parse(FS.readFileSync(config.source));
  for (const k in imp) {
    config[k] = imp[k];
  }
  console.log(`Imported settings from "${config.source}"`);
} catch (e) {
  console.error(e);
}
// Load the model's taglist CSV ("tag,score" per line).
// config.tags is repurposed here: path string -> { tag: score } lookup.
// Splitting on /\r?\n/ tolerates CRLF files; blank lines (e.g. a trailing
// newline) are skipped so they don't produce a bogus "" -> NaN entry.
const csv = FS.readFileSync(config.tags).toString().split(/\r?\n/);
config.tags = {};
for (const line of csv) {
  if (!line) continue;
  const [k, v] = line.split(",");
  config.tags[k] = parseInt(v, 10); // radix 10: scores are plain decimal counts
}
// Boost overridden tags to the top of the sort order; each successive
// override gets a slightly lower score so the configured order is preserved.
for (const raw of config.tagsOverride) {
  const override = raw.replace(/_/g, " ");
  config.tags[override] = config.tagsOverrideStart--;
}
// md5 -> e621 post cache from previous runs; start empty if missing/corrupt.
let cache;
try {
  cache = JSON.parse(FS.readFileSync(config.cache));
} catch (e) {
  cache = {};
}
let args = process.argv;
args.shift();
args.shift();
if ( args[0] ) config.input = args[0];
if ( args[1] ) config.output = args[1];
if ( config.lora ) {
config.filenameLimit = 0;
if ( config.tagDelimiter.length == 1 ) {
config.tagDelimiter += " ";
}
}
for ( let k in {"input":null, "output":null} ) {
try {
if ( !FS.lstatSync(config[k]).isDirectory() ) {
console.error(`specified path for ${k} is not a directory: ${config[k]}`)
return;
}
} catch ( e ) {
console.error(`specified path for ${k} is not found: ${config[k]}`)
return;
}
}
let files = FS.readdirSync(config.input);
console.log(`Parsing ${files.length} files from ${config.input} => ${config.output}`)
// Main loop: for every input file whose name contains an MD5 hash, fetch
// (or read from cache) its e621 post, build a prioritized tag list, and
// copy the image to config.output under a tag-based filename plus a
// sidecar .txt caption file.
let parse = async () => {
  for ( let i in files ) {
    let file = files[i];
    // accept "<md5>..." or "...<md5>.jpg|jpeg|png" filenames
    let md5 = file.match(/^([a-f0-9]{32})/);
    if ( !md5 ) {
      md5 = file.match(/([a-f0-9]{32})\.(jpe?g|png)$/);
      if ( !md5 ) continue; // not an md5-named file, skip it
    }
    md5 = md5[1];
    console.log(`[${(100.0 * i / files.length).toFixed(3)}%]: ${md5}`);
    let rateLimit = false;
    // Only hit the API on a cache miss. Checking with `in` (not truthiness)
    // lets a cached `null` record a known-missing post, so it is no longer
    // re-fetched on every run (undefined was dropped by JSON.stringify).
    if ( !(md5 in cache) ) {
      rateLimit = true;
      let r = await Fetch( `https://e621.net/posts.json?tags=md5:${md5}`, {
        headers: {
          'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
        }
      } );
      let j = JSON.parse(await r.text());
      cache[md5] = ( j.posts && j.posts[0] ) || null; // null = no post for this hash
    }
    let json = cache[md5];
    if ( !json ) continue; // no post found, nothing to tag
    // `let` added: this was previously an implicit global
    let tags = [...config.tagsAutoInclude];
    let artist = "";
    let content = "";
    switch ( json.rating ) {
      case "s": content = "safe content"; break;
      case "q": content = "questionable content"; break;
      // NOTE(review): "explict" is misspelled, but models trained on these
      // captions may expect this exact token — confirm before correcting
      case "e": content = "explict content"; break;
    }
    for ( let cat in json.tags ) {
      let override = config.tagsOverride.includes(cat);
      if ( cat === "artist" ) {
        // artists are folded into a single "by X and Y" pseudo-tag
        let tag = `by ${json.tags["artist"].join(" and ")}`
        if ( config.onlyIncludeModelArtists && !config.tags[tag] ) continue;
        artist = tag;
      } else for ( let k in json.tags[cat] ) {
        let tag = json.tags[cat][k].replace(/_/g, " ");
        if ( !override ) override = config.tagsOverride.includes(tag)
        if ( override ) {
          if ( !config.tags[tag] ) {
            // Score unseen override tags counting down from
            // tagsOverrideStart; category-level overrides get a
            // power-of-ten boost so earlier categories outrank later ones.
            // NOTE(review): indexOf(cat) is -1 when the override matched
            // the tag rather than its category, leaving scale = 1 —
            // confirm that is the intended ranking.
            let idx = config.tagsOverride.indexOf( cat );
            let scale = idx >= 0 ? Math.pow( 10, config.tagsOverride.length - idx + 1 ) : 1;
            config.tags[tag] = (config.tagsOverrideStart--) * scale;
          }
        } else if ( !config.tags[tag] ) continue; // tag unknown to the model
        // drop tags containing characters that can't appear in a filename
        let filtered = false;
        for ( let i in config.invalidCharacters ) {
          if ( tag.indexOf(config.invalidCharacters[i]) >= 0 ) {
            filtered = true;
            break;
          }
        }
        if ( config.filter ) {
          for ( let i in config.filters ) {
            let filter = config.filters[i];
            // filters may be exact strings or regexes
            if ( filter === tag || ( filter instanceof RegExp && tag.match(filter) ) ) {
              filtered = true;
              break;
            }
          }
          if ( filtered ) continue;
        }
        if ( !filtered ) tags.push(tag);
      }
    }
    // Highest model score first (inverted when reverseTags is set).
    // NOTE(review): auto-included tags with no score compare as NaN,
    // leaving their relative order unspecified.
    tags = tags.sort( (a, b) => {
      let polarity = config.reverseTags ? -1 : 1;
      return (config.tags[b] - config.tags[a]) * polarity;
    })
    if ( artist ) tags.unshift(artist);
    if ( content ) tags.unshift(content);
    // Keep tags until the joined string would exceed the filename limit
    // (limit of 0 disables the cap), stripping trailing "(series)"
    // qualifiers afterwards if requested.
    let jointmp = "";
    let filtered = [];
    for ( let i in tags ) {
      if ( config.filenameLimit && (jointmp + config.tagDelimiter + tags[i]).length > config.filenameLimit ) break;
      jointmp += config.tagDelimiter + tags[i];
      if ( config.removeParentheses )
        tags[i] = tags[i].replace(/\(.+?\)$/, "").trim()
      filtered.push(tags[i])
    }
    let joined = filtered.join(config.tagDelimiter)
    if ( config.lora ) {
      // LoRA mode: images are renamed by index; captions live in <index>.txt
      FS.copyFileSync(`${config.input}/${file}`, `${config.output}/${file.replace(md5, i).trim()}`)
    } else {
      // otherwise the tag list itself becomes the filename
      FS.copyFileSync(`${config.input}/${file}`, `${config.output}/${file.replace(md5, joined).trim()}`)
    }
    FS.writeFileSync(`${config.output}/${i}.txt`, joined)
    // be polite to the API: only sleep when a request was actually made
    if ( rateLimit && config.rateLimit ) await new Promise( (resolve) => {
      setTimeout(resolve, config.rateLimit)
    } )
  }
  // persist the tag cache (pretty-printed for easy inspection) for re-runs
  FS.writeFileSync(config.cache, JSON.stringify( cache, null, "\t" ))
}
// Surface failures instead of leaving a floating promise; keep a nonzero
// exit code so calling scripts can detect errors.
parse().catch((e) => {
  console.error(e);
  process.exitCode = 1;
});