let FS = require("fs") let Fetch = require("node-fetch") let boorus = { "e621": { urls: { api: "https://e621.net/posts.json", // endpoint to grab tag info from posts: "https://e621.net/posts/", // url to show post page, only for console logging }, config: { rateLimit: 500, // time to wait between requests, in milliseconds, e621 imposes a rate limit of 2 requests per second cookie: null, // put your account cookie here if for whatever reason you need it }, posts: ( json ) => { return json.posts; }, // way to process API output into an array of posts post: ( json ) => { // way to process JSON into a format this script uses let tags = []; for ( let cat in json.tags ) { for ( let k in json.tags[cat] ) { tags.push(json.tags[cat][k]) } } // need to log in to get it, reconstruct if ( !json.file.url ) { json.file.url = `https://static1.e621.net/data/${json.file.md5.slice(0,2)}/${json.file.md5.slice(2,4)}/${json.file.md5}.${json.file.ext}` } return { id: json.id, url: json.file.url, md5: json.file.md5, filename: `${json.file.md5}.${json.file.ext}`, tags }; } } } let config = { source: "./data/config/fetch.json", booru: "e621", // booru definition to use from the above object, currently only supports e621 query: ``, // example query if no argument is passed, kept empty so the script can scream at you for not having it tagged output: `./images/downloaded/`, // directory to save your files cache: `./data/cache.json`, // JSON file of cached tags, will speed up processing when used for the renamer script images: `./images/cache/`, // total cache of images, will copy if file exists here limit: 320, // how many posts to pull in one go concurrency: 4, // how many file requests to keep in flight at the same time userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36', filter: true, // sort of redundant given you can "filter" with the query, but e621 has a query limit, so here you can bypass it filters: [ // consult the preprocess.js script for examples "animated", // training only supports static images ], skipSave: false, // useful if you want to just cache your tags before running pre-process on files you already downloaded } // import source if ( FS.existsSync(config.source) ) try { let imp = JSON.parse( FS.readFileSync(config.source) ) for ( let k in imp ) { config[k] = imp[k] } console.log(`Imported settings from "${config.source}"`) } catch ( e ) { console.error(e) } let booru = boorus[config.booru]; // merge booru and config for ( let k in booru.config ) if ( !config[k] ) config[k] = booru.config[k]; // load tag cache to merge into later let cache; try { cache = JSON.parse( FS.readFileSync(config.cache) ) } catch ( e ) { cache = {}; } // grab requested query from arguments let args = process.argv; args.shift(); args.shift(); if ( args[0] ) config.query = args[0]; if ( args[1] ) config.output = args[1]; // require a query, without it you effectively have a script to download the entirety of e621 if ( !config.query ) { console.error("No arguments passed; example: `node fetch.js 'kemono -dog'`") return; } try { if ( !FS.lstatSync(config.output).isDirectory() ) { console.error(`specified path for output is not a directory: ${config.output}`) return; } } catch ( e ) { console.error(`specified path for output is not found: ${config.output}`) return; } // clamp concurrency if ( !config.concurrency || config.concurrency < 1 ) config.concurrency = 1; // fetch options to use for each request let options = { headers: { 'user-agent': 
console.log(`Downloading images of tags "${config.query}" to folder "${config.output}"`);

let parse = async () => {
	let posts = [];
	let last = ''; // last post ID seen, used for grabbing the next page
	do {
		// build the query string for the next page of results
		let query = [`tags=${config.query}`];
		if ( config.limit ) query.push(`limit=${config.limit}`);
		if ( last ) query.push(`page=b${last}`); // "b<id>" requests posts before that ID
		query = encodeURI(query.join("&"));

		let r = await Fetch( `${booru.urls.api}?${query}`, options );
		posts = booru.posts(await r.json());

		let promises = [];
		for ( let i in posts ) {
			let post = booru.post(posts[i]);
			last = `${post.id}`;
			cache[post.md5] = posts[i];

			// skip files already present in the output directory
			if ( FS.existsSync(`${config.output}${post.filename}`) ) {
				console.log(`Skipping existing file: ${booru.urls.posts}${post.id}`);
				continue;
			}
			// copy from the local image cache instead of re-downloading
			if ( FS.existsSync(`${config.images}${post.filename}`) ) {
				console.log(`Copying cached file: ${booru.urls.posts}${post.id}`);
				FS.copyFileSync(`${config.images}${post.filename}`, `${config.output}${post.filename}`);
				continue;
			}

			if ( config.filter ) {
				let filtered = null;
				// nasty nested loops, dying for a go-to
				for ( let j in post.tags ) {
					let tag = post.tags[j];
					for ( let k in config.filters ) {
						let filter = config.filters[k];
						if ( filter === tag || ( filter instanceof RegExp && tag.match(filter) ) ) {
							filtered = tag;
							break;
						}
					}
					if ( filtered ) break;
				}
				if ( filtered ) {
					console.log(`Skipping filtered post: ${booru.urls.posts}${post.id}`, filtered);
					continue; // skip just this post, not the rest of the page
				}
			}

			// once the batch is full, wait for it to drain before queueing more
			if ( promises.length >= config.concurrency ) {
				await Promise.all(promises);
				promises = [];
			}

			if ( !config.skipSave ) {
				promises.push(Fetch(post.url, options).then(res => new Promise((resolve, reject) => {
					// stream the response body to disk
					const dest = FS.createWriteStream(`${config.output}${post.filename}`);
					res.body.pipe(dest);
					dest.on('close', () => {
						console.log(`Downloaded: ${booru.urls.posts}${post.id}`);
						// mirror the download into the image cache, if that directory exists
						if ( FS.existsSync(`${config.images}`) ) {
							FS.copyFileSync(`${config.output}${post.filename}`, `${config.images}${post.filename}`);
						}
						resolve();
					});
					dest.on('error', reject);
				})).catch((err) => {
					console.error(`Error while fetching: ${post.id}`, posts[i], err);
				}));
			}
		}
		// wait for any downloads still in flight before moving on to the next page
		await Promise.all(promises);

		if ( config.rateLimit ) await new Promise( (resolve) => { setTimeout(resolve, config.rateLimit) } );
		FS.writeFileSync(config.cache, JSON.stringify( cache, null, "\t" ));
	} while ( posts.length );
};

parse();
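// Usage sketch (assumes node-fetch v2, since `require()` and `res.body.pipe`
// rely on the v2 CommonJS/stream API; v3 is ESM-only):
//
//   node fetch.js 'kemono -dog'                      // query only, saves to config.output
//   node fetch.js 'kemono -dog' ./images/downloaded/ // query plus explicit output directory
//
// Any key in `config` can also be overridden via the JSON file at config.source,
// e.g. a hypothetical ./data/config/fetch.json:
//
//   { "limit": 100, "skipSave": true }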