stable-diffusion-utils/src/fetch.py


import os
import sys
import re
import json
import time
import shutil
import urllib.parse
import urllib.request

# normalize an e621 post object into the minimal fields the downloader needs
def booru_e621_post( post ):
	tags = []
	for cat in post['tags']:
		for tag in post['tags'][cat]:
			tags.append(tag)
	# posts hidden behind a login have no URL in the API response, so reconstruct it from the md5
	if post['file']['url'] is None:
		post['file']['url'] = f"https://static1.e621.net/data/{post['file']['md5'][0:2]}/{post['file']['md5'][2:4]}/{post['file']['md5']}.{post['file']['ext']}"
	return {
		'id': post['id'],
		'url': post['file']['url'],
		'md5': post['file']['md5'],
		'filename': f"{post['file']['md5']}.{post['file']['ext']}",
		'tags': tags,
	}

boorus = {
	'e621': {
		'urls': {
			'api': "https://e621.net/posts.json", # endpoint to grab tag info from
			'posts': "https://e621.net/posts/", # url to show post page, only for console logging
		},
		'config': {
			'rateLimit': 500, # time to wait between requests, in milliseconds, e621 imposes a rate limit of 2 requests per second
			'cookie': None, # put your account cookie here if for whatever reason you need it
		},
		'posts': lambda response: response['posts'], # pull the list of posts out of an API response
		'post': booru_e621_post # normalize a single post
	}
}

config = {
	'source': './data/config/fetch.json', # JSON file of setting overrides, loaded below if it exists
	'booru': "e621", # booru definition to use from the above object, currently only supports e621
	'query': '', # default query if no argument is passed, kept empty so the script can scream at you to provide one
	'output': './images/downloaded/', # directory to save your files
	'cache': './data/cache.json', # JSON file of cached tags, will speed up processing when used for the renamer script
	'images': './images/cache/', # local cache of images, will copy from here instead of re-downloading if the file exists
	'limit': 320, # how many posts to pull in one go
	'userAgent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
	'filter': True, # sort of redundant given you can "filter" with the query, but e621 has a query limit, so here you can bypass it
	'filters': [ # consult the preprocess.js script for examples
		"animated", # training only supports static images
	],
	'filtersRegex': [],
	'skipSave': False, # useful if you want to just cache your tags before running pre-process on files you already downloaded
}
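
# An optional override file can be dropped at ./data/config/fetch.json; a minimal example
# (keys mirror the defaults above, any subset works) might look like:
# {
# 	"query": "kemono -dog",
# 	"limit": 100,
# 	"filters": [ "animated" ]
# }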

# pull overrides from the user's fetch.json, if one exists
if os.path.exists(config['source']):
	try:
		with open(config['source'], 'rb') as f:
			imp = json.loads(f.read().decode('utf-8'))
			for k in imp:
				config[k] = imp[k]
			print(f"Imported settings from {config['source']}")
	except Exception:
		pass # malformed override file, stick with the defaults above

booru = boorus[config['booru']]
# fill in any booru-specific defaults (rate limit, cookie) the user didn't override
for k in booru["config"]:
	if k not in config:
		config[k] = booru["config"][k]

# load the existing tag cache, if any, so repeated runs keep accumulating
cache = {}
try:
	with open(config['cache'], 'rb') as f:
		cache = json.loads(f.read().decode('utf-8'))
except Exception:
	pass # no cache yet, start fresh

args = sys.argv[1] if len(sys.argv) > 1 else ''
if len(args) == 0:
	print('No arguments passed, example: `python3 fetch.py "kemono -dog"`')
	quit()
config['query'] = urllib.parse.quote(args)

def parse():
	global booru, config, cache
	posts = []
	last = ''
	while True: # no do-while in Python, so break out once a page comes back empty
		# build the query string: tags, page size, and the id to paginate from
		query = [f"tags={config['query']}"]
		if config['limit']:
			query.append(f"limit={config['limit']}")
		if last:
			query.append(f"page=b{last}") # e621 pagination: "b<id>" requests posts with an id below <id>
		query = "&".join(query)
		with urllib.request.urlopen(urllib.request.Request(f"{booru['urls']['api']}?{query}",
			headers = {
				'user-agent': config['userAgent'],
				'cookie': config['cookie'] or ""
			}
		)) as r:
			posts = booru['posts'](json.loads(r.read()))
		if len(posts) == 0:
			break
		for raw in posts:
			post = booru['post'](raw)
			last = f"{post['id']}" # remember the last id seen, for the next page request
			cache[post['md5']] = raw # cache the raw post even if the download gets skipped
			# already downloaded
			if os.path.exists(f"{config['output']}{post['filename']}"):
				print(f"Skipping existing file: {booru['urls']['posts']}{post['id']}")
				continue
			# already in the local image cache, copy it instead of re-downloading
			if os.path.exists(f"{config['images']}{post['filename']}"):
				print(f"Copying cached file: {booru['urls']['posts']}{post['id']}")
				shutil.copy(os.path.join(config['images'], post['filename']), os.path.join(config['output'], post['filename']))
				continue
			# no usable URL even after reconstruction, would need an authenticated session
			if post['url'] is None:
				print(f"Skipping file that requires logging in: {booru['urls']['posts']}{post['id']}")
				continue
if config["filter"]:
filtered = None
for tag in post['tags']:
if tag in config['filters']:
filtered = tag
break
for filter in config['filtersRegex']:
if re.search(filter, tag):
filtered = tag
break
if filtered is not None:
break
if filtered is not None:
print(f"Skipping filtered post: {booru['urls']['posts']}{post['id']} {tag}")
continue
			# download the file (and mirror it into the image cache), unless we're only caching tags
			if not config['skipSave']:
				urllib.request.urlretrieve(post['url'], f"{config['output']}{post['filename']}")
				if os.path.exists(f"{config['images']}"):
					shutil.copy(os.path.join(config['output'], post['filename']), os.path.join(config['images'], post['filename']))
				print(f"Downloaded : {booru['urls']['posts']}{post['id']}")
			# stay under the booru's request rate limit
			if config['rateLimit']:
				time.sleep(config['rateLimit'] / 1000.0)
	# persist the tag cache for the renamer / pre-process scripts
	with open(config['cache'], 'wb') as f:
		f.write(json.dumps(cache, indent='\t').encode('utf-8'))

if __name__ == "__main__":
	parse()
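
# Example run (paths assume the defaults above):
#   python3 fetch.py "kemono -dog"
# matching posts land in ./images/downloaded/ and their tags are written to ./data/cache.json
# for the renamer / pre-process step.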