python script for the e621 downloader

mrq 2022-10-10 20:51:34 +00:00
parent d7e163e0e6
commit 1a25271de6
5 changed files with 147 additions and 10 deletions


@@ -42,7 +42,7 @@ If you're lacking material, the web UI's pre-processing tools to flip and split
If you would rather have finely-crafted material, you're more than welcome to manually crop and square images. A compromise to cropping an image is to expand the canvas size to square it off, then fill the new empty space with colors that crudely blend with the background, and crudely add color blobs to extend limbs outside the frame. It's not that imperative to do so, but it helps.
If you want to accelerate your ~~scraping~~ content acquisition, consult the fetch script under [`./utils/renamer/`](https://git.coom.tech/mrq/stable-diffusion-utils/src/branch/master/utils/renamer/).
If you want to accelerate your ~~scraping~~ content acquisition, consult the fetch script under [`./utils/renamer/`](https://git.coom.tech/mrq/stable-diffusion-utils/src/branch/master/utils/renamer/). It's a """simple but powerful""" script that can ~~scrape~~ download from e621 given a search query.
### Source Material For A Style


@@ -6,22 +6,20 @@ If you're targeting another booru, the same principles apply, but you'll need to
## Dependencies
While I strive to keep dependencies minimal, only the pre-processing script is available in Python; the e621 downloading script is only available in node.js, as I'm not that strong of a python dev. It's reasonable to assume everyone has python, as it's a hard dependency for using voldy's web UI.
Python scripts have no additional dependencies, while node.js scripts require running `npm install node-fetch@2` (v2.x because I'm old and still using `require` for my includes).
The python scripts have no additional dependencies, while the node.js scripts require running `npm install node-fetch@2` (v2.x because I'm old and still using `require` for my includes).
## Fetch
**!**TODO**!** Rewrite in python, currently only available in node.js
This script is responsible for ~~scraping~~ downloading all requested files for your target subject/style from e621.
To run, simply invoke the script with `node fetch.js [search query]`. For example: `node fetch.js "kemono -dog"` to download all non-dog posts tagged as kemono.
To run, simply invoke the script with `python fetch.py [search query]`. For example: `python fetch.py "kemono -dog"` to download all non-dog posts tagged as kemono.
The script has some tunables, but the defaults are sane enough not to require any additional configuration.
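For reference, those tunables live in the `config` dict at the top of `fetch.py`; here's a minimal sketch of the values you're most likely to touch (defaults as committed):

```python
config = {
    'output': './in/',       # directory downloaded images land in
    'cache': './cache.json', # tag cache reused by the renamer/preprocess script
    'limit': 320,            # posts per API request (320 is e621's maximum)
    'filter': True,          # enable the tag filters below
    'filters': [
        "animated",          # training only supports static images
    ],
    'filtersRegex': [],      # regex-based filters, if you need them
}
```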
If you're using another booru, extending the script to support your booru of choice is easy, as the script was configured to allow for additional booru definitions. Just reference the provided one for e621 if you need a starting point.
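To illustrate the shape of a definition, here's an untested sketch of what a hypothetical Danbooru entry might look like; the field names (`file_url`, `tag_string`, and so on) are assumptions you'd need to verify against that booru's actual API:

```python
def booru_danbooru_post( post ):
    return {
        'id': post['id'],
        'url': post['file_url'],
        'md5': post['md5'],
        'filename': f"{post['md5']}.{post['file_ext']}",
        'tags': post['tag_string'].split(" "), # hypothetical: tags arrive as one space-separated string
    }

boorus['danbooru'] = {
    'urls': {
        'api': "https://danbooru.donmai.us/posts.json", # assumed endpoint
        'posts': "https://danbooru.donmai.us/posts/",
    },
    'config': {
        'rateLimit': 500,
        'cookie': None,
    },
    'posts': lambda json: json, # assumed: the post list is returned directly, not under a 'posts' key
    'post': booru_danbooru_post,
}
```

Point `config['booru']` at the new key and the rest of the script should work unchanged.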
The python script is nearly at feature-parity with the node.js script, albeit missing the concurrency option. Please understand, not a Python dev.
## Pre-Process
The bread and butter of this repo is the preprocess script, responsible for associating your images from e621 with tags to train against during Textual Inversion.


@@ -39,7 +39,7 @@ let config = {
    output: `./in/`, // directory to save your files
    cache: `./cache.json`, // JSON file of cached tags, will speed up processing when used for the renamer script
    limit: 10, // how many posts to pull in one go
    limit: 320, // how many posts to pull in one go
    concurrency: 4, // how many file requests to keep in flight at the same time
    userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
@@ -69,7 +69,7 @@ args.shift();
if ( args.length ) config.query = args.join(" ");
// require a query, without it you effectively have a script to download the entirety of e621
if ( !config.query ) {
    console.error("No arguments passed; example: `node fetch.js 'kemono -dog'")
    console.error("No arguments passed; example: `node fetch.js 'kemono -dog'`")
    return;
}
// clamp concurrency

utils/renamer/fetch.py (new executable file, 139 lines added)

@@ -0,0 +1,139 @@
import os
import sys
import re
import json
import time
import shutil
import urllib.parse
import urllib.request

# flatten e621's per-category tag groups into a single list of tags
def booru_e621_post( json ):
    tags = []
    for cat in json['tags']:
        for tag in json['tags'][cat]:
            tags.append(tag)
    return {
        'id': json['id'],
        'url': json['file']['url'],
        'md5': json['file']['md5'],
        'filename': f"{json['file']['md5']}.{json['file']['ext']}",
        'tags': tags,
    }

boorus = {
    'e621': {
        'urls': {
            'api': "https://e621.net/posts.json", # endpoint to grab tag info from
            'posts': "https://e621.net/posts/", # url to show post page, only for console logging
        },
        'config': {
            'rateLimit': 500, # time to wait between requests, in milliseconds; e621 imposes a rate limit of 2 requests per second
            'cookie': None, # put your account cookie here if for whatever reason you need it
        },
        'posts': lambda json: json['posts'],
        'post': booru_e621_post
    }
}

config = {
    'booru': "e621", # booru definition to use from the above object, currently only supports e621
    'query': '', # default query if no argument is passed; kept empty so the script can scream at you for not passing one
    'output': './in/', # directory to save your files
    'cache': './cache.json', # JSON file of cached tags, will speed up processing when used for the renamer script
    'limit': 320, # how many posts to pull in one go
    'userAgent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36', # sent with API requests
    'filter': True, # sort of redundant given you can "filter" with the query, but e621 has a query limit, so here you can bypass it
    'filters': [ # consult the preprocess.js script for examples
        "animated", # training only supports static images
    ],
    'filtersRegex': [],
}

# pull in booru-specific defaults (rate limit, cookie) not already set in config
booru = boorus[config['booru']]
for k in booru["config"]:
    if k not in config:
        config[k] = booru["config"][k]

# load the existing tag cache if present; a missing or unreadable cache is fine
cache = {}
try:
    with open(config['cache'], 'rb') as f:
        cache = json.loads(f.read().decode('utf-8'))
except (OSError, ValueError):
    pass

args = sys.argv[1:]
if len(args) == 0:
    print('No arguments passed, example: `python3 fetch.py "kemono -dog"`')
    quit()
config['query'] = urllib.parse.quote(" ".join(args))

def parse():
    global booru, config, cache

    posts = []
    last = ''
    while True: # Python has no do-while; loop until the API returns an empty page
        query = [f"tags={config['query']}"]
        if config['limit']:
            query.append(f"limit={config['limit']}")
        if last:
            query.append(f"page=b{last}") # paginate backwards from the last post ID seen
        query = "&".join(query)
        with urllib.request.urlopen(urllib.request.Request(f"{booru['urls']['api']}?{query}",
            headers = {
                'user-agent': config['userAgent'],
                'cookie': config['cookie'] or ""
            }
        )) as r:
            posts = booru['posts'](json.loads(r.read()))
        if len(posts) == 0:
            break
        for _ in range(len(posts)):
            post = booru['post'](posts[_])
            last = f"{post['id']}"
            cache[post['md5']] = posts[_]
            if os.path.exists(f"{config['output']}{post['filename']}"):
                print(f"Skipping existing file: {booru['urls']['posts']}{post['id']}")
                continue
            if config["filter"]:
                filtered = False
                for tag in post['tags']:
                    if tag in config['filters']:
                        filtered = True
                        break
                    for filter in config['filtersRegex']:
                        if re.search(filter, tag):
                            filtered = True
                            break
                    if filtered:
                        break
                if filtered:
                    print(f"Skipping filtered post: {booru['urls']['posts']}{post['id']} {tag}")
                    continue
            if not post['url']: # e621 withholds file URLs on some posts for logged-out requests
                print(f"Skipping post with no file URL: {booru['urls']['posts']}{post['id']}")
                continue
            urllib.request.urlretrieve(post['url'], f"{config['output']}{post['filename']}")
            print(f"Downloaded : {booru['urls']['posts']}{post['id']}")
            if config['rateLimit']:
                time.sleep(config['rateLimit'] / 1000.0) # stay under the booru's rate limit
        # persist the tag cache after every page so the preprocess/rename script can reuse it
        with open(config['cache'], 'wb') as f:
            f.write(json.dumps(cache, indent='\t').encode('utf-8'))

if __name__ == "__main__":
    parse()


@@ -115,7 +115,7 @@ def parse():
    for filter in config['filtersRegex']:
        if re.search(filter, tag):
            should = True
            continue # was break in the original script, fixed ;)
            break
    if should:
        continue