stable-diffusion-utils/src/fetch.js

let FS = require("fs")
let Fetch = require("node-fetch")
let boorus = {
	"e621": {
		urls: {
			api: "https://e621.net/posts.json", // endpoint to grab tag info from
			posts: "https://e621.net/posts/", // URL of the post page, only used for console logging
		},
		config: {
			rateLimit: 500, // time to wait between requests, in milliseconds; e621 imposes a rate limit of 2 requests per second
			cookie: null, // put your account cookie here if for whatever reason you need it
		},
		posts: ( json ) => { return json.posts; }, // how to turn the API output into an array of posts
		post: ( json ) => { // how to turn a raw post into the format this script uses
			let tags = [];
			for ( let cat in json.tags ) {
				for ( let k in json.tags[cat] ) {
					tags.push(json.tags[cat][k])
				}
			}
			// file URLs are hidden from logged-out requests for some posts; reconstruct from the md5
			if ( !json.file.url ) {
				json.file.url = `https://static1.e621.net/data/${json.file.md5.slice(0,2)}/${json.file.md5.slice(2,4)}/${json.file.md5}.${json.file.ext}`
			}
			return {
				id: json.id,
				url: json.file.url,
				md5: json.file.md5,
				filename: `${json.file.md5}.${json.file.ext}`,
				tags
			};
		}
	}
}
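// Adding support for another booru means writing a second entry of the same shape.
// As a sketch only (hypothetical, API shape unverified), a Danbooru adapter could look like:
// "danbooru": {
// 	urls: {
// 		api: "https://danbooru.donmai.us/posts.json", // assumed endpoint
// 		posts: "https://danbooru.donmai.us/posts/",
// 	},
// 	config: {
// 		rateLimit: 1000,
// 		cookie: null,
// 	},
// 	posts: ( json ) => { return json; }, // assumes the API returns a bare array of posts
// 	post: ( json ) => { return {
// 		id: json.id,
// 		url: json.file_url,
// 		md5: json.md5,
// 		filename: `${json.md5}.${json.file_ext}`,
// 		tags: json.tag_string.split(" "), // assumes a space-separated tag_string field
// 	}; },
// }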
let config = {
	source: "./data/config/fetch.json",
	booru: "e621", // booru definition to use from the above object; currently only e621 is defined
	query: ``, // fallback query if no argument is passed; left empty so the script can refuse to run without one
	output: `./images/downloaded/`, // directory to save your files to
	cache: `./data/cache.json`, // JSON file of cached tags; speeds up processing when reused by the renamer script
	images: `./images/cache/`, // master cache of images; files already present here are copied instead of re-downloaded
	limit: 320, // how many posts to pull per request (320 is e621's maximum)
	concurrency: 4, // how many file requests to keep in flight at the same time
	userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
	filter: true, // somewhat redundant given you can "filter" within the query itself, but e621 caps the number of tags per query, so this lets you filter past that limit
	filters: [ // consult the preprocess.js script for examples
		"animated", // training only supports static images
	],
	skipSave: false, // useful if you want to just cache your tags before running pre-process on files you already downloaded
}
// import source
if ( FS.existsSync(config.source) ) try {
	let imp = JSON.parse( FS.readFileSync(config.source) )
	for ( let k in imp ) {
		config[k] = imp[k]
	}
	console.log(`Imported settings from "${config.source}"`)
} catch ( e ) {
	console.error(e)
}
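// As an example, a settings file at ./data/config/fetch.json with these
// (hypothetical) contents would override the query and slow the crawl down:
// {
// 	"query": "canine -animated",
// 	"concurrency": 2,
// 	"rateLimit": 1000
// }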
let booru = boorus[config.booru];
if ( !booru ) {
	console.error(`Unknown booru "${config.booru}"`)
	process.exit(1);
}
// merge booru defaults into config, without clobbering explicitly set values (even falsy ones, like a rateLimit of 0)
for ( let k in booru.config ) if ( config[k] === undefined ) config[k] = booru.config[k];
// load tag cache to merge into later
let cache;
try {
	cache = JSON.parse( FS.readFileSync(config.cache) )
} catch ( e ) {
	cache = {};
}
// grab requested query from arguments
let args = process.argv.slice(2); // drop the node binary and the script path
if ( args[0] ) config.query = args[0];
if ( args[1] ) config.output = args[1];
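// Usage: node fetch.js '<query>' [output directory]
// e.g. (output path hypothetical): node fetch.js 'kemono -dog' ./images/kemono/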
// require a query; without it you effectively have a script to download the entirety of e621
if ( !config.query ) {
	console.error("No arguments passed; example: `node fetch.js 'kemono -dog'`")
	process.exit(1);
}
try {
	if ( !FS.lstatSync(config.output).isDirectory() ) {
		console.error(`specified path for output is not a directory: ${config.output}`)
		process.exit(1);
	}
} catch ( e ) {
	console.error(`specified path for output does not exist: ${config.output}`)
	process.exit(1);
}
// clamp concurrency
if ( !config.concurrency || config.concurrency < 1 ) config.concurrency = 1;
// fetch options to use for each request
let options = {
	headers: {
		'user-agent': config.userAgent,
	}
}
// only send a cookie header when one is actually configured
if ( config.cookie ) options.headers['cookie'] = config.cookie;
console.log(`Downloading images of tags "${config.query}" to folder "${config.output}"`)
let parse = async () => {
	let posts = [];
	let last = ''; // ID of the last post seen, used for grabbing the next page
	do {
		let query = [`tags=${config.query}`]
		if ( config.limit ) query.push(`limit=${config.limit}`)
		if ( last ) query.push(`page=b${last}`)
		query = encodeURI(query.join("&"));
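		// For illustration, the first request ends up looking like
		//   https://e621.net/posts.json?tags=kemono%20-dog&limit=320
		// and later pages append e621's "before this ID" cursor, e.g. page=b3141592 (ID hypothetical)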
		let r = await Fetch( `${booru.urls.api}?${query}`, options );
		posts = booru.posts(JSON.parse(await r.text()));
		let promises = [];
		for ( let i in posts ) {
			let post = booru.post(posts[i]);
			last = `${post.id}`
			cache[post.md5] = posts[i];
			if ( FS.existsSync(`${config.output}${post.filename}`) ) {
				console.log(`Skipping existing file: ${booru.urls.posts}${post.id}`)
				continue;
			}
			if ( FS.existsSync(`${config.images}${post.filename}`) ) {
				console.log(`Copying cached file: ${booru.urls.posts}${post.id}`)
				FS.copyFileSync(`${config.images}${post.filename}`, `${config.output}${post.filename}`)
				continue;
			}
			if ( config.filter ) {
				let filtered = null;
				// labeled loop so we can bail out of both levels at once
				outer: for ( let j in post.tags ) {
					let tag = post.tags[j];
					for ( let k in config.filters ) {
						let filter = config.filters[k];
						if ( filter === tag || ( filter instanceof RegExp && tag.match(filter) ) ) {
							filtered = tag;
							break outer;
						}
					}
				}
				if ( filtered ) {
					console.log(`Skipping filtered post: ${booru.urls.posts}${post.id}`, filtered)
					continue;
				}
			}
			// once we hit the concurrency cap, wait for the batch in flight to finish
			if ( promises.length >= config.concurrency ) {
				await Promise.all(promises);
				promises = [];
			}
			if ( !config.skipSave )
				promises.push(Fetch(post.url, options).then(res => new Promise((resolve, reject) => {
					// stream the file to disk, then mirror it into the image cache if one exists
					const dest = FS.createWriteStream(`${config.output}${post.filename}`);
					res.body.pipe(dest);
					dest.on('close', () => {
						console.log(`Downloaded: ${booru.urls.posts}${post.id}`)
						if ( FS.existsSync(`${config.images}`) ) {
							FS.copyFileSync(`${config.output}${post.filename}`, `${config.images}${post.filename}`)
						}
						resolve()
					});
					dest.on('error', reject);
				})).catch((err)=>{
					console.error(`Error while fetching: ${post.id}`, posts[i], err);
				}));
		}
		// wait for any downloads still in flight before finishing the page
		await Promise.all(promises);
		if ( config.rateLimit ) await new Promise( (resolve) => {
			setTimeout(resolve, config.rateLimit)
		} )
		// persist the tag cache after every page so an interrupted run keeps its progress
		FS.writeFileSync(config.cache, JSON.stringify( cache, null, "\t" ))
	} while ( posts.length );
}
parse().catch(console.error);