import os
import sys
import re
import json
import time
import shutil
import urllib.parse
import urllib.request


def booru_e621_post(post): # parameter renamed from `json`, which shadowed the json module
	# flatten e621's categorised tag lists into one flat list
	tags = []
	for cat in post['tags']:
		for tag in post['tags'][cat]:
			tags.append(tag)

	# normalise the raw post into the few fields the downloader needs
	return {
		'id': post['id'],
		'url': post['file']['url'],
		'md5': post['file']['md5'],
		'filename': f"{post['file']['md5']}.{post['file']['ext']}",
		'tags': tags,
	}

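# For reference, the slice of an e621 post object that booru_e621_post reads
# looks roughly like this (field names per the e621 API; values are invented):
#
#   {
#     "id": 12345,
#     "file": {"url": "https://static1.e621.net/…", "md5": "d41d8cd9…", "ext": "png"},
#     "tags": {"general": ["outdoors"], "species": ["dog"], "artist": ["somebody"]}
#   }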
|
boorus = {
|
||
|
'e621': {
|
||
|
'urls': {
|
||
|
'api': "https://e621.net/posts.json", # endpoint to grab tag info from
|
||
|
'posts': "https://e621.net/posts/", # url to show post page, only for console logging
|
||
|
},
|
||
|
'config': {
|
||
|
'rateLimit': 500, # time to wait between requests, in milliseconds, e621 imposes a rate limit of 2 requests per second
|
||
|
'cookie': None, # put your account cookie here if for whatever reason you need it
|
||
|
},
|
||
|
'posts': lambda json: json['posts'],
|
||
|
'post': booru_e621_post
|
||
|
}
|
||
|
}
|
||
|
|

config = {
	'booru': "e621", # booru definition to use from the object above, currently only supports e621

	'query': '', # example query if no argument is passed, kept empty so the script can scream at you when you forget to pass one

	'output': './in/', # directory to save your files to
	'cache': './cache.json', # JSON file of cached tag data, speeds up processing when reused by the renamer script

	'limit': 320, # how many posts to pull in one go

	'userAgent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',

	'filter': True, # somewhat redundant given you can "filter" with the query, but e621 caps how many tags a query can contain, so here you can filter past that limit
	'filters': [ # consult the preprocess.js script for examples
		"animated", # training only supports static images
	],
	'filtersRegex': [],
}
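
# For instance, to also drop comics and any post tagged with a bare year, you
# could set (illustrative values, not the script's defaults):
#
#   'filters': ["animated", "comic"],
#   'filtersRegex': [r"^\d{4}$"],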

booru = boorus[config['booru']]

# fold the booru's own defaults (rateLimit, cookie) into the main config
for k in booru["config"]:
	if k not in config:
		config[k] = booru["config"][k]

cache = {}
try:
	with open(config['cache'], 'r', encoding='utf-8') as f:
		cache = json.load(f)
except (OSError, ValueError): # no cache yet, or unreadable JSON: start fresh
	pass

args = sys.argv[1:]
if len(args) == 0:
	print('No arguments passed, example: `python3 fetch.py "kemono -dog"`')
	sys.exit(1)

os.makedirs(config['output'], exist_ok=True) # make sure the output directory exists before downloading into it

config['query'] = urllib.parse.quote(" ".join(args))
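# e.g. `python3 fetch.py kemono -dog` joins and quotes the args into "kemono%20-dog"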


def parse():
	global booru, config, cache
	posts = []
	last = ''

	while True: # Python has no do-while, so loop forever and break on an empty page
		query = [f"tags={config['query']}"]
		if config['limit']:
			query.append(f"limit={config['limit']}")
		if last:
			query.append(f"page=b{last}") # e621 cursor pagination: posts with an id below the last one seen
		query = "&".join(query)

		with urllib.request.urlopen(urllib.request.Request(f"{booru['urls']['api']}?{query}",
			headers = {
				'user-agent': config['userAgent'], # was a hard-coded copy of the config value
				'cookie': config['cookie'] or ""
			}
		)) as r:
			posts = booru['posts'](json.loads(r.read()))

		if len(posts) == 0:
			break

		for raw in posts: # iterate the raw post objects directly rather than by index
			post = booru['post'](raw)
			last = f"{post['id']}"
			cache[post['md5']] = raw # cache the full post object, keyed by md5

			if os.path.exists(f"{config['output']}{post['filename']}"):
				print(f"Skipping existing file: {booru['urls']['posts']}{post['id']}")
				continue

			if config["filter"]:
				filtered = None # set to the offending tag, if any
				for tag in post['tags']:
					if tag in config['filters']:
						filtered = tag
						break
					for pattern in config['filtersRegex']: # renamed from `filter`, which shadowed the builtin
						if re.search(pattern, tag):
							filtered = tag
							break
					if filtered:
						break
				if filtered:
					print(f"Skipping filtered post: {booru['urls']['posts']}{post['id']} {filtered}")
					continue

			# urlretrieve would send Python's default User-Agent, which e621 tends to
			# block, so stream the file through a request carrying our headers instead
			req = urllib.request.Request(post['url'], headers = {'user-agent': config['userAgent']})
			with urllib.request.urlopen(req) as r, open(f"{config['output']}{post['filename']}", 'wb') as f:
				shutil.copyfileobj(r, f)
			print(f"Downloaded : {booru['urls']['posts']}{post['id']}")

			if config['rateLimit']:
				time.sleep(config['rateLimit'] / 1000.0) # stay under e621's 2 requests/second limit

	with open(config['cache'], 'wb') as f:
		f.write(json.dumps(cache, indent='\t').encode('utf-8'))


if __name__ == "__main__":
	parse()
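
# Usage sketch, per the config above:
#   python3 fetch.py "kemono -dog"
# downloads matching posts into ./in/ and writes their tag data to ./cache.json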