import os
import sys
import re
import json
import time
import urllib.parse
import urllib.request

def booru_e621_post(data):
    # flatten the per-category tag lists into one list of tag strings
    tags = []
    for cat in data['tags']:
        for tag in data['tags'][cat]:
            tags.append(tag)

    # the URL is withheld unless you are logged in, so reconstruct it from the md5
    if data['file']['url'] is None:
        data['file']['url'] = f"https://static1.e621.net/data/{data['file']['md5'][0:2]}/{data['file']['md5'][2:4]}/{data['file']['md5']}.{data['file']['ext']}"

    return {
        'id': data['id'],
        'url': data['file']['url'],
        'md5': data['file']['md5'],
        'filename': f"{data['file']['md5']}.{data['file']['ext']}",
        'tags': tags,
    }

boorus = {
    'e621': {
        'urls': {
            'api': "https://e621.net/posts.json", # endpoint to grab tag info from
            'posts': "https://e621.net/posts/", # URL of the post page, only used for console logging
        },
        'config': {
            'rateLimit': 500, # time to wait between requests, in milliseconds; e621 imposes a rate limit of 2 requests per second
            'cookie': None, # put your account cookie here if for whatever reason you need it
        },
        'posts': lambda data: data['posts'],
        'post': booru_e621_post,
    }
}

config = {
    'booru': "e621", # booru definition to use from the object above, currently only e621 is supported
    'query': '', # query used if no argument is passed, kept empty so the script can complain when none is given
    'output': './in/', # directory to save your files to
    'cache': './cache.json', # JSON file of cached tags, speeds up processing when used by the renamer script
    'limit': 320, # how many posts to pull in one request
    'userAgent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
    'filter': True, # somewhat redundant given you can "filter" with the query, but e621 has a query tag limit, so this lets you bypass it
    'filters': [ # consult the preprocess.js script for examples
        "animated", # training only supports static images
    ],
    'filtersRegex': [],
}

booru = boorus[config['booru']]
# pull booru-specific defaults (rateLimit, cookie, ...) into the main config
for k in booru['config']:
    if k not in config:
        config[k] = booru['config'][k]

cache = {}
try:
    with open(config['cache'], 'rb') as f:
        cache = json.loads(f.read().decode('utf-8'))
except (OSError, ValueError):
    pass

args = sys.argv[1:]
if len(args) == 0:
    print('No arguments passed, example: `python3 fetch.py "kemono -dog"`')
    sys.exit(1)
config['query'] = urllib.parse.quote(" ".join(args))

def parse():
    global booru, config, cache
    last = ''
    while True: # no do-while in Python
        query = [f"tags={config['query']}"]
        if config['limit']:
            query.append(f"limit={config['limit']}")
        if last:
            query.append(f"page=b{last}") # paginate by requesting posts before the last seen id
        query = "&".join(query)

        request = urllib.request.Request(f"{booru['urls']['api']}?{query}", headers={
            'user-agent': config['userAgent'],
            'cookie': config['cookie'] or "",
        })
        with urllib.request.urlopen(request) as r:
            posts = booru['posts'](json.loads(r.read()))

        if len(posts) == 0:
            break

        for raw in posts:
            post = booru['post'](raw)
            last = f"{post['id']}"
            cache[post['md5']] = raw

            if os.path.exists(f"{config['output']}{post['filename']}"):
                print(f"Skipping existing file: {booru['urls']['posts']}{post['id']}")
                continue
            if post['url'] is None:
                print(f"Skipping file that requires logging in: {booru['urls']['posts']}{post['id']}")
                continue

            if config['filter']:
                filtered = False
                for tag in post['tags']:
                    if tag in config['filters']:
                        filtered = True
                        break
                    for pattern in config['filtersRegex']:
                        if re.search(pattern, tag):
                            filtered = True
                            break
                    if filtered:
                        break
                if filtered:
                    print(f"Skipping filtered post: {booru['urls']['posts']}{post['id']} {tag}")
                    continue

            urllib.request.urlretrieve(post['url'], f"{config['output']}{post['filename']}")
            print(f"Downloaded: {booru['urls']['posts']}{post['id']}")

            if config['rateLimit']:
                time.sleep(config['rateLimit'] / 1000.0)

    with open(config['cache'], 'wb') as f:
        f.write(json.dumps(cache, indent='\t').encode('utf-8'))

if __name__ == "__main__":
    parse()
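
# ---------------------------------------------------------------------------
# Sketch (not part of the original script): how a second booru definition
# could be added. The shape mirrors the 'e621' entry above: 'posts' pulls the
# post list out of the API response, and 'post' normalizes one raw post into
# the {id, url, md5, filename, tags} dict that parse() expects. The site and
# its field names below ('file_url', 'file_ext', 'tag_string') are assumptions
# about a hypothetical danbooru-style API and would need checking against the
# real response before use.
#
# def booru_danbooru_post(data):
#     return {
#         'id': data['id'],
#         'url': data['file_url'],
#         'md5': data['md5'],
#         'filename': f"{data['md5']}.{data['file_ext']}",
#         'tags': data['tag_string'].split(),
#     }
#
# boorus['danbooru'] = {
#     'urls': {
#         'api': "https://danbooru.donmai.us/posts.json",
#         'posts': "https://danbooru.donmai.us/posts/",
#     },
#     'config': {
#         'rateLimit': 1000, # assumed polite default, check the site's own limits
#         'cookie': None,
#     },
#     'posts': lambda data: data, # assumes the API returns the post list directly
#     'post': booru_danbooru_post,
# }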