stable-diffusion-utils/src/fetch.js

let FS = require("fs")
let Fetch = require("node-fetch")
let boorus = {
	"e621": {
		urls: {
			api: "https://e621.net/posts.json", // endpoint to grab tag info from
			posts: "https://e621.net/posts/", // URL of the post page, only used for console logging
		},
		config: {
			rateLimit: 500, // time to wait between requests, in milliseconds; e621 imposes a rate limit of 2 requests per second
			cookie: null, // put your account cookie here if for whatever reason you need it
		},
		posts: ( json ) => { return json.posts; }, // how to turn the API output into an array of posts
		post: ( json ) => { // how to turn a raw post into the format this script uses
			let tags = [];
			for ( let cat in json.tags ) {
				for ( let k in json.tags[cat] ) {
					tags.push(json.tags[cat][k])
				}
			}
			// file URLs are hidden from logged-out requests for some posts; reconstruct from the md5
			if ( !json.file.url ) {
				json.file.url = `https://static1.e621.net/data/${json.file.md5.slice(0,2)}/${json.file.md5.slice(2,4)}/${json.file.md5}.${json.file.ext}`
			}
			return {
				id: json.id,
				url: json.file.url,
				md5: json.file.md5,
				filename: `${json.file.md5}.${json.file.ext}`,
				tags
			};
		}
	}
}
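// Adding support for another booru means writing a second entry of the same shape.
// As a sketch only (hypothetical, API shape unverified), a Danbooru adapter could look like:
// "danbooru": {
// 	urls: {
// 		api: "https://danbooru.donmai.us/posts.json", // assumed endpoint
// 		posts: "https://danbooru.donmai.us/posts/",
// 	},
// 	config: {
// 		rateLimit: 1000,
// 		cookie: null,
// 	},
// 	posts: ( json ) => { return json; }, // assumes the API returns a bare array of posts
// 	post: ( json ) => { return {
// 		id: json.id,
// 		url: json.file_url,
// 		md5: json.md5,
// 		filename: `${json.md5}.${json.file_ext}`,
// 		tags: json.tag_string.split(" "), // assumes a space-separated tag_string field
// 	}; },
// }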
let config = {
	source: "./data/config/fetch.json",
	booru: "e621", // booru definition to use from the above object; currently only e621 is defined
	query: ``, // fallback query if no argument is passed; left empty so the script can refuse to run without one
	output: `./images/downloaded/`, // directory to save your files to
	cache: `./data/cache.json`, // JSON file of cached tags; speeds up processing when reused by the renamer script
	images: `./images/cache/`, // master cache of images; files already present here are copied instead of re-downloaded
	limit: 320, // how many posts to pull per request (320 is e621's maximum)
	concurrency: 4, // how many file requests to keep in flight at the same time
	userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
	filter: true, // somewhat redundant given you can "filter" within the query itself, but e621 caps the number of tags per query, so this lets you filter past that limit
	filters: [ // consult the preprocess.js script for examples
		"animated", // training only supports static images
	],
	skipSave: false, // useful if you want to just cache your tags before running pre-process on files you already downloaded
}
// import source
if ( FS.existsSync(config.source) ) try {
	let imp = JSON.parse( FS.readFileSync(config.source) )
	for ( let k in imp ) {
		config[k] = imp[k]
	}
	console.log(`Imported settings from "${config.source}"`)
} catch ( e ) {
	console.error(e)
}
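// As an example, a settings file at ./data/config/fetch.json with these
// (hypothetical) contents would override the query and slow the crawl down:
// {
// 	"query": "canine -animated",
// 	"concurrency": 2,
// 	"rateLimit": 1000
// }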
let booru = boorus[config.booru];
if ( !booru ) {
	console.error(`Unknown booru "${config.booru}"`)
	process.exit(1);
}
// merge booru defaults into config, without clobbering explicitly set values (even falsy ones, like a rateLimit of 0)
for ( let k in booru.config ) if ( config[k] === undefined ) config[k] = booru.config[k];
// load tag cache to merge into later
let cache;
try {
	cache = JSON.parse( FS.readFileSync(config.cache) )
} catch ( e ) {
	cache = {};
}
// grab requested query from arguments
let args = process.argv.slice(2); // drop the node binary and the script path
if ( args[0] ) config.query = args[0];
if ( args[1] ) config.output = args[1];
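// Usage: node fetch.js '<query>' [output directory]
// e.g. (output path hypothetical): node fetch.js 'kemono -dog' ./images/kemono/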
// require a query; without it you effectively have a script to download the entirety of e621
if ( !config.query ) {
	console.error("No arguments passed; example: `node fetch.js 'kemono -dog'`")
	process.exit(1);
}
try {
	if ( !FS.lstatSync(config.output).isDirectory() ) {
		console.error(`specified path for output is not a directory: ${config.output}`)
		process.exit(1);
	}
} catch ( e ) {
	console.error(`specified path for output does not exist: ${config.output}`)
	process.exit(1);
}
// clamp concurrency
if ( !config.concurrency || config.concurrency < 1 ) config.concurrency = 1;
// fetch options to use for each request
let options = {
	headers: {
		'user-agent': config.userAgent,
	}
}
// only send a cookie header when one is actually configured
if ( config.cookie ) options.headers['cookie'] = config.cookie;
console.log(`Downloading images of tags "${config.query}" to folder "${config.output}"`)
let parse = async () => {
	let posts = [];
	let last = ''; // ID of the last post seen, used for grabbing the next page
	do {
		let query = [`tags=${config.query}`]
		if ( config.limit ) query.push(`limit=${config.limit}`)
		if ( last ) query.push(`page=b${last}`)
		query = encodeURI(query.join("&"));
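		// For illustration, the first request ends up looking like
		//   https://e621.net/posts.json?tags=kemono%20-dog&limit=320
		// and later pages append e621's "before this ID" cursor, e.g. page=b3141592 (ID hypothetical)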
		let r = await Fetch( `${booru.urls.api}?${query}`, options );
		posts = booru.posts(JSON.parse(await r.text()));
		let promises = [];
		for ( let i in posts ) {
			let post = booru.post(posts[i]);
			last = `${post.id}`
			cache[post.md5] = posts[i];
			if ( FS.existsSync(`${config.output}${post.filename}`) ) {
				console.log(`Skipping existing file: ${booru.urls.posts}${post.id}`)
				continue;
			}
			if ( FS.existsSync(`${config.images}${post.filename}`) ) {
				console.log(`Copying cached file: ${booru.urls.posts}${post.id}`)
				FS.copyFileSync(`${config.images}${post.filename}`, `${config.output}${post.filename}`)
				continue;
			}
			if ( config.filter ) {
				let filtered = null;
				// labeled loop so we can bail out of both levels at once
				outer: for ( let j in post.tags ) {
					let tag = post.tags[j];
					for ( let k in config.filters ) {
						let filter = config.filters[k];
						if ( filter === tag || ( filter instanceof RegExp && tag.match(filter) ) ) {
							filtered = tag;
							break outer;
						}
					}
				}
				if ( filtered ) {
					console.log(`Skipping filtered post: ${booru.urls.posts}${post.id}`, filtered)
					continue;
				}
			}
			// once we hit the concurrency cap, wait for the batch in flight to finish
			if ( promises.length >= config.concurrency ) {
				await Promise.all(promises);
				promises = [];
			}
			if ( !config.skipSave )
				promises.push(Fetch(post.url, options).then(res => new Promise((resolve, reject) => {
					// stream the file to disk, then mirror it into the image cache if one exists
					const dest = FS.createWriteStream(`${config.output}${post.filename}`);
					res.body.pipe(dest);
					dest.on('close', () => {
						console.log(`Downloaded: ${booru.urls.posts}${post.id}`)
						if ( FS.existsSync(`${config.images}`) ) {
							FS.copyFileSync(`${config.output}${post.filename}`, `${config.images}${post.filename}`)
						}
						resolve()
					});
					dest.on('error', reject);
				})).catch((err)=>{
					console.error(`Error while fetching: ${post.id}`, posts[i], err);
				}));
		}
		// wait for any downloads still in flight before finishing the page
		await Promise.all(promises);
		if ( config.rateLimit ) await new Promise( (resolve) => {
			setTimeout(resolve, config.rateLimit)
		} )
		// persist the tag cache after every page so an interrupted run keeps its progress
		FS.writeFileSync(config.cache, JSON.stringify( cache, null, "\t" ))
	} while ( posts.length );
}
parse().catch(console.error);