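#!/usr/bin/env python3
"""fetch.py - download booru posts matching a tag query.

Pulls post metadata from the configured booru API (currently e621), pages
through the results, filters out unwanted tags, saves each image under
config['output'], and caches every post's JSON in config['cache'] for the
renamer script.
"""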
import os
import sys
import re
import json
import time
import shutil
import urllib.parse
import urllib.request


def booru_e621_post(data):
    """Normalize a raw e621 post into the minimal shape the script uses."""
    tags = []
    for cat in data['tags']:
        for tag in data['tags'][cat]:
            tags.append(tag)

    # posts hidden behind a login have no url; reconstruct it from the md5,
    # which follows e621's static file layout
    if data['file']['url'] is None:
        data['file']['url'] = f"https://static1.e621.net/data/{data['file']['md5'][0:2]}/{data['file']['md5'][2:4]}/{data['file']['md5']}.{data['file']['ext']}"

    return {
        'id': data['id'],
        'url': data['file']['url'],
        'md5': data['file']['md5'],
        'filename': f"{data['file']['md5']}.{data['file']['ext']}",
        'tags': tags,
    }


boorus = {
    'e621': {
        'urls': {
            'api': "https://e621.net/posts.json",  # endpoint to grab tag info from
            'posts': "https://e621.net/posts/",  # url of the post page, only used for console logging
        },
        'config': {
            'rateLimit': 500,  # time to wait between requests, in milliseconds; e621 imposes a rate limit of 2 requests per second
            'cookie': None,  # put your account cookie here if for whatever reason you need it
        },
        'posts': lambda response: response['posts'],  # unwrap the post list from the API response
        'post': booru_e621_post,  # normalize a single raw post
    }
}
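# to add another booru, give its entry the same shape: 'urls', 'config'
# defaults, a 'posts' accessor that unwraps the API response, and a 'post'
# normalizer returning {'id', 'url', 'md5', 'filename', 'tags'}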
config = {
    'booru': "e621",  # booru definition to use from the object above, currently only e621 is supported

    'query': '',  # example query if no argument is passed, kept empty so the script can scream at you for not passing tags

    'output': './in/',  # directory to save your files
    'cache': './cache.json',  # JSON file of cached tags, speeds up processing when used by the renamer script

    'limit': 320,  # how many posts to pull in one go

    'userAgent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',

    'filter': True,  # sort of redundant given you can "filter" with the query, but e621 limits how many tags a query may contain, so this lets you bypass that
    'filters': [  # consult the preprocess.js script for examples
        "animated",  # training only supports static images
    ],
    'filtersRegex': [],
}


booru = boorus[config['booru']]

# fill in booru-specific defaults (rateLimit, cookie) that the user has not overridden
for k in booru['config']:
    if k not in config:
        config[k] = booru['config'][k]

cache = {}
try:
    with open(config['cache'], 'rb') as f:
        cache = json.loads(f.read().decode('utf-8'))
except (OSError, ValueError):  # no cache yet, or an unreadable one; start fresh
    pass

args = sys.argv[1:]
if len(args) == 0:
    print('No arguments passed, example: `python3 fetch.py "kemono -dog"`')
    sys.exit(1)

config['query'] = urllib.parse.quote(" ".join(args))

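# parse() walks the results page by page using e621's `page=b<id>` cursor
# (posts with ids strictly below <id>), stopping once a request comes back empty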
def parse():
    global booru, config, cache
    os.makedirs(config['output'], exist_ok=True)  # make sure the output directory exists
    posts = []
    last = ''

    while True:  # no do-while in Python; the empty-page check below breaks out
        query = [f"tags={config['query']}"]
        if config['limit']:
            query.append(f"limit={config['limit']}")
        if last:
            query.append(f"page=b{last}")  # only posts with ids below the last one seen
        query = "&".join(query)

        # use the configured user agent instead of duplicating the string here
        req = urllib.request.Request(f"{booru['urls']['api']}?{query}", headers={
            'user-agent': config['userAgent'],
            'cookie': config['cookie'] or ""
        })
        with urllib.request.urlopen(req) as r:
            posts = booru['posts'](json.loads(r.read()))

        if len(posts) == 0:
            break

        for raw in posts:
            post = booru['post'](raw)
            last = f"{post['id']}"
            cache[post['md5']] = raw

            if os.path.exists(f"{config['output']}{post['filename']}"):
                print(f"Skipping existing file: {booru['urls']['posts']}{post['id']}")
                continue

            if post['url'] is None:
                print(f"Skipping file that requires logging in: {booru['urls']['posts']}{post['id']}")
                continue

if config["filter"]:
|
|
filtered = False
|
|
for tag in post['tags']:
|
|
if tag in config['filters']:
|
|
filtered = True
|
|
break
|
|
for filter in config['filtersRegex']:
|
|
if re.search(filter, tag):
|
|
filtered = True
|
|
break
|
|
if filtered:
|
|
break
|
|
if filtered:
|
|
print(f"Skipping filtered post: {booru['urls']['posts']}{post['id']} {tag}")
|
|
continue
|
|
|
|
|
|
            # urlretrieve would send Python's default user agent and no cookie,
            # which e621 may reject, so stream the download through a request
            # that carries the same headers as the API call
            req = urllib.request.Request(post['url'], headers={
                'user-agent': config['userAgent'],
                'cookie': config['cookie'] or ""
            })
            with urllib.request.urlopen(req) as r, open(f"{config['output']}{post['filename']}", 'wb') as out:
                shutil.copyfileobj(r, out)
            print(f"Downloaded : {booru['urls']['posts']}{post['id']}")

            if config['rateLimit']:
                time.sleep(config['rateLimit'] / 1000.0)  # milliseconds to seconds

    # persist everything we saw this run so the renamer script can reuse it
    with open(config['cache'], 'wb') as f:
        f.write(json.dumps(cache, indent='\t').encode('utf-8'))


if __name__ == "__main__":
    parse()