Add a small flag to the fetch scripts to skip saving (downloading) files; useful for grabbing tags before running the preprocess script on files that are already downloaded

This commit is contained in:
mrq 2022-10-12 20:05:54 +00:00
parent f000aab55f
commit caa1a1707b
2 changed files with 28 additions and 23 deletions

View File

@ -53,6 +53,8 @@ let config = {
filters: [ // consult the preprocess.js script for examples filters: [ // consult the preprocess.js script for examples
"animated", // training only supports static images "animated", // training only supports static images
], ],
skipSave: false, // useful if you want to just cache your tags before running pre-process on files you already downloaded
} }
let booru = boorus[config.booru]; let booru = boorus[config.booru];
@ -112,22 +114,21 @@ let parse = async () => {
} }
if ( config.filter ) { if ( config.filter ) {
let filtered = false; let filtered = null;
// nasty nested loops, dying for a go-to // nasty nested loops, dying for a go-to
for ( let j in post.tags ) { for ( let j in post.tags ) {
let tag = post.tags[j]; tag = post.tags[j];
for ( let k in config.filters ) { for ( let k in config.filters ) {
let filter = config.filters[k]; let filter = config.filters[k];
if ( filter === tag || ( filter instanceof RegExp && tag.match(filter) ) ) { if ( filter === tag || ( filter instanceof RegExp && tag.match(filter) ) ) {
filtered = true; filtered = tag;
break; break;
} }
} }
if ( filtered ) break; if ( filtered ) break;
} }
if ( filtered ) { if ( filtered ) {
console.log(`Skipping filtered post: ${booru.urls.posts}${post.id}`, tag) console.log(`Skipping filtered post: ${booru.urls.posts}${post.id}`, filtered)
break; break;
} }
} }
@ -139,17 +140,18 @@ let parse = async () => {
promises.push(Fetch(post.url, options).then(res => new Promise((resolve, reject) => { if ( !config.skipSave)
const dest = FS.createWriteStream(`${config.output}${post.filename}`); promises.push(Fetch(post.url, options).then(res => new Promise((resolve, reject) => {
res.body.pipe(dest); const dest = FS.createWriteStream(`${config.output}${post.filename}`);
dest.on('close', () => { res.body.pipe(dest);
console.log(`Downloaded: ${booru.urls.posts}${post.id}`) dest.on('close', () => {
resolve() console.log(`Downloaded: ${booru.urls.posts}${post.id}`)
}); resolve()
dest.on('error', reject); });
})).catch((err)=>{ dest.on('error', reject);
console.error(`Error while fetching: ${post.id}`, posts[i], err); })).catch((err)=>{
})); console.error(`Error while fetching: ${post.id}`, posts[i], err);
}));
} }
if ( config.rateLimit ) await new Promise( (resolve) => { if ( config.rateLimit ) await new Promise( (resolve) => {

View File

@ -58,6 +58,8 @@ config = {
"animated", # training only supports static images "animated", # training only supports static images
], ],
'filtersRegex': [], 'filtersRegex': [],
'skipSave': False, # useful if you want to just cache your tags before running pre-process on files you already downloaded
} }
booru = boorus[config['booru']] booru = boorus[config['booru']]
@ -118,24 +120,25 @@ def parse():
continue continue
if config["filter"]: if config["filter"]:
filtered = False filtered = None
for tag in post['tags']: for tag in post['tags']:
if tag in config['filters']: if tag in config['filters']:
filtered = True filtered = tag
break break
for filter in config['filtersRegex']: for filter in config['filtersRegex']:
if re.search(filter, tag): if re.search(filter, tag):
filtered = True filtered = tag
break break
if filtered: if filtered is not None:
break break
if filtered: if filtered is not None:
print(f"Skipping filtered post: {booru['urls']['posts']}{post['id']} {tag}") print(f"Skipping filtered post: {booru['urls']['posts']}{post['id']} {tag}")
continue continue
urllib.request.urlretrieve(post['url'], f"{config['output']}{post['filename']}") if not config['skipSave']:
print(f"Downloaded : {booru['urls']['posts']}{post['id']}") urllib.request.urlretrieve(post['url'], f"{config['output']}{post['filename']}")
print(f"Downloaded : {booru['urls']['posts']}{post['id']}")
if config['rateLimit']: if config['rateLimit']:
time.sleep(config['rateLimit'] / 1000.0) time.sleep(config['rateLimit'] / 1000.0)