stable-diffusion-utils/src/fetch.py


import os
import sys
import re
import json
import time
import shutil
import urllib.parse
import urllib.request

# normalize an e621 post object into the minimal fields the downloader needs
def booru_e621_post( post ):
	tags = []
	for cat in post['tags']:
		for tag in post['tags'][cat]:
			tags.append(tag)
	# posts hidden behind a login have no URL in the API response, so reconstruct it from the md5
	if post['file']['url'] is None:
		post['file']['url'] = f"https://static1.e621.net/data/{post['file']['md5'][0:2]}/{post['file']['md5'][2:4]}/{post['file']['md5']}.{post['file']['ext']}"
	return {
		'id': post['id'],
		'url': post['file']['url'],
		'md5': post['file']['md5'],
		'filename': f"{post['file']['md5']}.{post['file']['ext']}",
		'tags': tags,
	}

boorus = {
	'e621': {
		'urls': {
			'api': "https://e621.net/posts.json", # endpoint to grab tag info from
			'posts': "https://e621.net/posts/", # url to show post page, only for console logging
		},
		'config': {
			'rateLimit': 500, # time to wait between requests, in milliseconds, e621 imposes a rate limit of 2 requests per second
			'cookie': None, # put your account cookie here if for whatever reason you need it
		},
		'posts': lambda response: response['posts'], # pull the list of posts out of an API response
		'post': booru_e621_post # normalize a single post
	}
}

config = {
	'source': './data/config/fetch.json', # JSON file of setting overrides, loaded below if it exists
	'booru': "e621", # booru definition to use from the above object, currently only supports e621
	'query': '', # default query if no argument is passed, kept empty so the script can scream at you to provide one
	'output': './images/downloaded/', # directory to save your files
	'cache': './data/cache.json', # JSON file of cached tags, will speed up processing when used for the renamer script
	'images': './images/cache/', # local cache of images, will copy from here instead of re-downloading if the file exists
	'limit': 320, # how many posts to pull in one go
	'userAgent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
	'filter': True, # sort of redundant given you can "filter" with the query, but e621 has a query limit, so here you can bypass it
	'filters': [ # consult the preprocess.js script for examples
		"animated", # training only supports static images
	],
	'filtersRegex': [],
	'skipSave': False, # useful if you want to just cache your tags before running pre-process on files you already downloaded
}
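
# An optional override file can be dropped at ./data/config/fetch.json; a minimal example
# (keys mirror the defaults above, any subset works) might look like:
# {
# 	"query": "kemono -dog",
# 	"limit": 100,
# 	"filters": [ "animated" ]
# }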

# pull overrides from the user's fetch.json, if one exists
if os.path.exists(config['source']):
	try:
		with open(config['source'], 'rb') as f:
			imp = json.loads(f.read().decode('utf-8'))
			for k in imp:
				config[k] = imp[k]
			print(f"Imported settings from {config['source']}")
	except Exception:
		pass # malformed override file, stick with the defaults above

booru = boorus[config['booru']]
# fill in any booru-specific defaults (rate limit, cookie) the user didn't override
for k in booru["config"]:
	if k not in config:
		config[k] = booru["config"][k]

# load the existing tag cache, if any, so repeated runs keep accumulating
cache = {}
try:
	with open(config['cache'], 'rb') as f:
		cache = json.loads(f.read().decode('utf-8'))
except Exception:
	pass # no cache yet, start fresh

args = sys.argv[1] if len(sys.argv) > 1 else ''
if len(args) == 0:
	print('No arguments passed, example: `python3 fetch.py "kemono -dog"`')
	quit()
config['query'] = urllib.parse.quote(args)

def parse():
	global booru, config, cache
	posts = []
	last = ''
	while True: # no do-while in Python, so break out once a page comes back empty
		# build the query string: tags, page size, and the id to paginate from
		query = [f"tags={config['query']}"]
		if config['limit']:
			query.append(f"limit={config['limit']}")
		if last:
			query.append(f"page=b{last}") # e621 pagination: "b<id>" requests posts with an id below <id>
		query = "&".join(query)
		with urllib.request.urlopen(urllib.request.Request(f"{booru['urls']['api']}?{query}",
			headers = {
				'user-agent': config['userAgent'],
				'cookie': config['cookie'] or ""
			}
		)) as r:
			posts = booru['posts'](json.loads(r.read()))
		if len(posts) == 0:
			break
		for raw in posts:
			post = booru['post'](raw)
			last = f"{post['id']}" # remember the last id seen, for the next page request
			cache[post['md5']] = raw # cache the raw post even if the download gets skipped
			# already downloaded
			if os.path.exists(f"{config['output']}{post['filename']}"):
				print(f"Skipping existing file: {booru['urls']['posts']}{post['id']}")
				continue
			# already in the local image cache, copy it instead of re-downloading
			if os.path.exists(f"{config['images']}{post['filename']}"):
				print(f"Copying cached file: {booru['urls']['posts']}{post['id']}")
				shutil.copy(os.path.join(config['images'], post['filename']), os.path.join(config['output'], post['filename']))
				continue
			# no usable URL even after reconstruction, would need an authenticated session
			if post['url'] is None:
				print(f"Skipping file that requires logging in: {booru['urls']['posts']}{post['id']}")
				continue
if config["filter"]:
filtered = None
for tag in post['tags']:
if tag in config['filters']:
filtered = tag
break
for filter in config['filtersRegex']:
if re.search(filter, tag):
filtered = tag
break
if filtered is not None:
break
if filtered is not None:
print(f"Skipping filtered post: {booru['urls']['posts']}{post['id']} {tag}")
continue
			# download the file (and mirror it into the image cache), unless we're only caching tags
			if not config['skipSave']:
				urllib.request.urlretrieve(post['url'], f"{config['output']}{post['filename']}")
				if os.path.exists(f"{config['images']}"):
					shutil.copy(os.path.join(config['output'], post['filename']), os.path.join(config['images'], post['filename']))
				print(f"Downloaded : {booru['urls']['posts']}{post['id']}")
			# stay under the booru's request rate limit
			if config['rateLimit']:
				time.sleep(config['rateLimit'] / 1000.0)
	# persist the tag cache for the renamer / pre-process scripts
	with open(config['cache'], 'wb') as f:
		f.write(json.dumps(cache, indent='\t').encode('utf-8'))

if __name__ == "__main__":
	parse()
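
# Example run (paths assume the defaults above):
#   python3 fetch.py "kemono -dog"
# matching posts land in ./images/downloaded/ and their tags are written to ./data/cache.json
# for the renamer / pre-process step.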