import os
import sys
import re
import json
import time
import shutil
import urllib.parse
import urllib.request


def booru_e621_post(post): # parameter renamed from `json`, which shadowed the json module
	# flatten e621's categorised tag lists into one flat list
	tags = []
	for cat in post['tags']:
		for tag in post['tags'][cat]:
			tags.append(tag)

	# normalise the raw post into the few fields the downloader needs
	return {
		'id': post['id'],
		'url': post['file']['url'],
		'md5': post['file']['md5'],
		'filename': f"{post['file']['md5']}.{post['file']['ext']}",
		'tags': tags,
	}

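# For reference, the slice of an e621 post object that booru_e621_post reads
# looks roughly like this (field names per the e621 API; values are invented):
#
#   {
#     "id": 12345,
#     "file": {"url": "https://static1.e621.net/…", "md5": "d41d8cd9…", "ext": "png"},
#     "tags": {"general": ["outdoors"], "species": ["dog"], "artist": ["somebody"]}
#   }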
|
boorus = {
|
||
|
'e621': {
|
||
|
'urls': {
|
||
|
'api': "https://e621.net/posts.json", # endpoint to grab tag info from
|
||
|
'posts': "https://e621.net/posts/", # url to show post page, only for console logging
|
||
|
},
|
||
|
'config': {
|
||
|
'rateLimit': 500, # time to wait between requests, in milliseconds, e621 imposes a rate limit of 2 requests per second
|
||
|
'cookie': None, # put your account cookie here if for whatever reason you need it
|
||
|
},
|
||
|
'posts': lambda json: json['posts'],
|
||
|
'post': booru_e621_post
|
||
|
}
|
||
|
}
|
||
|
|

config = {
	'booru': "e621", # booru definition to use from the object above, currently only supports e621

	'query': '', # example query if no argument is passed, kept empty so the script can scream at you when you forget to pass one

	'output': './in/', # directory to save your files to
	'cache': './cache.json', # JSON file of cached tag data, speeds up processing when reused by the renamer script

	'limit': 320, # how many posts to pull in one go

	'userAgent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',

	'filter': True, # somewhat redundant given you can "filter" with the query, but e621 caps how many tags a query can contain, so here you can filter past that limit
	'filters': [ # consult the preprocess.js script for examples
		"animated", # training only supports static images
	],
	'filtersRegex': [],
}
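
# For instance, to also drop comics and any post tagged with a bare year, you
# could set (illustrative values, not the script's defaults):
#
#   'filters': ["animated", "comic"],
#   'filtersRegex': [r"^\d{4}$"],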

booru = boorus[config['booru']]

# fold the booru's own defaults (rateLimit, cookie) into the main config
for k in booru["config"]:
	if k not in config:
		config[k] = booru["config"][k]

cache = {}
try:
	with open(config['cache'], 'r', encoding='utf-8') as f:
		cache = json.load(f)
except (OSError, ValueError): # no cache yet, or unreadable JSON: start fresh
	pass

args = sys.argv[1:]
if len(args) == 0:
	print('No arguments passed, example: `python3 fetch.py "kemono -dog"`')
	sys.exit(1)

os.makedirs(config['output'], exist_ok=True) # make sure the output directory exists before downloading into it

config['query'] = urllib.parse.quote(" ".join(args))
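# e.g. `python3 fetch.py kemono -dog` joins and quotes the args into "kemono%20-dog"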


def parse():
	global booru, config, cache
	posts = []
	last = ''

	while True: # Python has no do-while, so loop forever and break on an empty page
		query = [f"tags={config['query']}"]
		if config['limit']:
			query.append(f"limit={config['limit']}")
		if last:
			query.append(f"page=b{last}") # e621 cursor pagination: posts with an id below the last one seen
		query = "&".join(query)

		with urllib.request.urlopen(urllib.request.Request(f"{booru['urls']['api']}?{query}",
			headers = {
				'user-agent': config['userAgent'], # was a hard-coded copy of the config value
				'cookie': config['cookie'] or ""
			}
		)) as r:
			posts = booru['posts'](json.loads(r.read()))

		if len(posts) == 0:
			break

		for raw in posts: # iterate the raw post objects directly rather than by index
			post = booru['post'](raw)
			last = f"{post['id']}"
			cache[post['md5']] = raw # cache the full post object, keyed by md5

			if os.path.exists(f"{config['output']}{post['filename']}"):
				print(f"Skipping existing file: {booru['urls']['posts']}{post['id']}")
				continue

			if config["filter"]:
				filtered = None # set to the offending tag, if any
				for tag in post['tags']:
					if tag in config['filters']:
						filtered = tag
						break
					for pattern in config['filtersRegex']: # renamed from `filter`, which shadowed the builtin
						if re.search(pattern, tag):
							filtered = tag
							break
					if filtered:
						break
				if filtered:
					print(f"Skipping filtered post: {booru['urls']['posts']}{post['id']} {filtered}")
					continue

			# urlretrieve would send Python's default User-Agent, which e621 tends to
			# block, so stream the file through a request carrying our headers instead
			req = urllib.request.Request(post['url'], headers = {'user-agent': config['userAgent']})
			with urllib.request.urlopen(req) as r, open(f"{config['output']}{post['filename']}", 'wb') as f:
				shutil.copyfileobj(r, f)
			print(f"Downloaded : {booru['urls']['posts']}{post['id']}")

			if config['rateLimit']:
				time.sleep(config['rateLimit'] / 1000.0) # stay under e621's 2 requests/second limit

	with open(config['cache'], 'wb') as f:
		f.write(json.dumps(cache, indent='\t').encode('utf-8'))


if __name__ == "__main__":
	parse()
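
# Usage sketch, per the config above:
#   python3 fetch.py "kemono -dog"
# downloads matching posts into ./in/ and writes their tag data to ./cache.json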