python script for the e621 downloader

mrq 2022-10-10 20:51:34 +00:00
parent d7e163e0e6
commit 1a25271de6
5 changed files with 147 additions and 10 deletions


@@ -42,7 +42,7 @@ If you're lacking material, the web UI's pre-processing tools to flip and split
If you would rather have finely-crafted material, you're more than welcome to manually crop and square images. A compromise to cropping an image is to expand the canvas size to square it off, then fill the new empty space with colors that crudely blend with the background, and crudely add color blobs to extend limbs outside the frame. It's not that imperative to do so, but it helps.
If you want to accelerate your ~~scraping~~ content acquisition, consult the fetch script under [`./utils/renamer/`](https://git.coom.tech/mrq/stable-diffusion-utils/src/branch/master/utils/renamer/).
If you want to accelerate your ~~scraping~~ content acquisition, consult the fetch script under [`./utils/renamer/`](https://git.coom.tech/mrq/stable-diffusion-utils/src/branch/master/utils/renamer/). It's a """simple but powerful""" script that can ~~scrape~~ download from e621 given a search query.
### Source Material For A Style


@@ -6,22 +6,20 @@ If you're targeting another booru, the same principles apply, but you'll need to
## Dependencies
While I strive to keep dependencies minimal, only the pre-processing script is available in Python; the e621 downloading script is only available in node.js, as I'm not that strong of a python dev. It's reasonable to assume everyone has python, as it's a hard dependency for using voldy's web UI.
Python scripts have no additional dependencies, while node.js scripts require running `npm install node-fetch@2` (v2.x because I'm old and still using `require` for my includes).
The python scripts have no additional dependencies, while the node.js scripts require running `npm install node-fetch@2` (v2.x because I'm old and still using `require` for my includes).
## Fetch
**!**TODO**!** Rewrite in python, currently only available in node.js
This script is responsible for ~~scraping~~ downloading all requested files for your target subject/style from e621.
To run, simply invoke the script with `node fetch.js [search query]`. For example: `node fetch.js "kemono -dog"` to download all non-dog posts tagged as kemono.
To run, simply invoke the script with `python fetch.py [search query]`. For example: `python fetch.py "kemono -dog"` to download all non-dog posts tagged as kemono.
The script has some tunables, but the defaults are sane enough not to require any additional configuration.
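For reference, those tunables live in the `config` dict at the top of `fetch.py`; here's a minimal sketch of the values you're most likely to touch (defaults as committed):

```python
config = {
    'output': './in/',       # directory downloaded images land in
    'cache': './cache.json', # tag cache reused by the renamer/preprocess script
    'limit': 320,            # posts per API request (320 is e621's maximum)
    'filter': True,          # enable the tag filters below
    'filters': [
        "animated",          # training only supports static images
    ],
    'filtersRegex': [],      # regex-based filters, if you need them
}
```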
If you're using another booru, extending the script to support your booru of choice is easy, as the script was configured to allow for additional booru definitions. Just reference the provided one for e621 if you need a starting point.
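To illustrate the shape of a definition, here's an untested sketch of what a hypothetical Danbooru entry might look like; the field names (`file_url`, `tag_string`, and so on) are assumptions you'd need to verify against that booru's actual API:

```python
def booru_danbooru_post( post ):
    return {
        'id': post['id'],
        'url': post['file_url'],
        'md5': post['md5'],
        'filename': f"{post['md5']}.{post['file_ext']}",
        'tags': post['tag_string'].split(" "), # hypothetical: tags arrive as one space-separated string
    }

boorus['danbooru'] = {
    'urls': {
        'api': "https://danbooru.donmai.us/posts.json", # assumed endpoint
        'posts': "https://danbooru.donmai.us/posts/",
    },
    'config': {
        'rateLimit': 500,
        'cookie': None,
    },
    'posts': lambda json: json, # assumed: the post list is returned directly, not under a 'posts' key
    'post': booru_danbooru_post,
}
```

Point `config['booru']` at the new key and the rest of the script should work unchanged.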
The python script is nearly at feature-parity with the node.js script, albeit missing the concurrency option. Please understand, not a Python dev.
## Pre-Process
The bread and butter of this repo is the preprocess script, responsible for associating your images from e621 with tags to train against during Textual Inversion.


@@ -39,7 +39,7 @@ let config = {
    output: `./in/`, // directory to save your files
    cache: `./cache.json`, // JSON file of cached tags, will speed up processing when used for the renamer script
    limit: 10, // how many posts to pull in one go
    limit: 320, // how many posts to pull in one go
    concurrency: 4, // how many file requests to keep in flight at the same time
    userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
@@ -69,7 +69,7 @@ args.shift();
if ( args.length ) config.query = args.join(" ");
// require a query, without it you effectively have a script to download the entirety of e621
if ( !config.query ) {
    console.error("No arguments passed; example: `node fetch.js 'kemono -dog'")
    console.error("No arguments passed; example: `node fetch.js 'kemono -dog'`")
    return;
}
// clamp concurrency

utils/renamer/fetch.py (new executable file, 139 lines added)

@@ -0,0 +1,139 @@
import os
import sys
import re
import json
import time
import shutil
import urllib.parse
import urllib.request

# flatten e621's per-category tag groups into a single list of tags
def booru_e621_post( json ):
    tags = []
    for cat in json['tags']:
        for tag in json['tags'][cat]:
            tags.append(tag)
    return {
        'id': json['id'],
        'url': json['file']['url'],
        'md5': json['file']['md5'],
        'filename': f"{json['file']['md5']}.{json['file']['ext']}",
        'tags': tags,
    }

boorus = {
    'e621': {
        'urls': {
            'api': "https://e621.net/posts.json", # endpoint to grab tag info from
            'posts': "https://e621.net/posts/", # url to show post page, only for console logging
        },
        'config': {
            'rateLimit': 500, # time to wait between requests, in milliseconds; e621 imposes a rate limit of 2 requests per second
            'cookie': None, # put your account cookie here if for whatever reason you need it
        },
        'posts': lambda json: json['posts'],
        'post': booru_e621_post
    }
}

config = {
    'booru': "e621", # booru definition to use from the above object, currently only supports e621
    'query': '', # default query if no argument is passed; kept empty so the script can scream at you for not passing one
    'output': './in/', # directory to save your files
    'cache': './cache.json', # JSON file of cached tags, will speed up processing when used for the renamer script
    'limit': 320, # how many posts to pull in one go
    'userAgent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36', # sent with API requests
    'filter': True, # sort of redundant given you can "filter" with the query, but e621 has a query limit, so here you can bypass it
    'filters': [ # consult the preprocess.js script for examples
        "animated", # training only supports static images
    ],
    'filtersRegex': [],
}

# pull in booru-specific defaults (rate limit, cookie) not already set in config
booru = boorus[config['booru']]
for k in booru["config"]:
    if k not in config:
        config[k] = booru["config"][k]

# load the existing tag cache if present; a missing or unreadable cache is fine
cache = {}
try:
    with open(config['cache'], 'rb') as f:
        cache = json.loads(f.read().decode('utf-8'))
except (OSError, ValueError):
    pass

args = sys.argv[1:]
if len(args) == 0:
    print('No arguments passed, example: `python3 fetch.py "kemono -dog"`')
    quit()
config['query'] = urllib.parse.quote(" ".join(args))

def parse():
    global booru, config, cache

    posts = []
    last = ''
    while True: # Python has no do-while; loop until the API returns an empty page
        query = [f"tags={config['query']}"]
        if config['limit']:
            query.append(f"limit={config['limit']}")
        if last:
            query.append(f"page=b{last}") # paginate backwards from the last post ID seen
        query = "&".join(query)
        with urllib.request.urlopen(urllib.request.Request(f"{booru['urls']['api']}?{query}",
            headers = {
                'user-agent': config['userAgent'],
                'cookie': config['cookie'] or ""
            }
        )) as r:
            posts = booru['posts'](json.loads(r.read()))
        if len(posts) == 0:
            break
        for _ in range(len(posts)):
            post = booru['post'](posts[_])
            last = f"{post['id']}"
            cache[post['md5']] = posts[_]
            if os.path.exists(f"{config['output']}{post['filename']}"):
                print(f"Skipping existing file: {booru['urls']['posts']}{post['id']}")
                continue
            if config["filter"]:
                filtered = False
                for tag in post['tags']:
                    if tag in config['filters']:
                        filtered = True
                        break
                    for filter in config['filtersRegex']:
                        if re.search(filter, tag):
                            filtered = True
                            break
                    if filtered:
                        break
                if filtered:
                    print(f"Skipping filtered post: {booru['urls']['posts']}{post['id']} {tag}")
                    continue
            if not post['url']: # e621 withholds file URLs on some posts for logged-out requests
                print(f"Skipping post with no file URL: {booru['urls']['posts']}{post['id']}")
                continue
            urllib.request.urlretrieve(post['url'], f"{config['output']}{post['filename']}")
            print(f"Downloaded : {booru['urls']['posts']}{post['id']}")
            if config['rateLimit']:
                time.sleep(config['rateLimit'] / 1000.0) # stay under the booru's rate limit
        # persist the tag cache after every page so the preprocess/rename script can reuse it
        with open(config['cache'], 'wb') as f:
            f.write(json.dumps(cache, indent='\t').encode('utf-8'))

if __name__ == "__main__":
    parse()


@@ -115,7 +115,7 @@ def parse():
    for filter in config['filtersRegex']:
        if re.search(filter, tag):
            should = True
            continue # was break in the original script, fixed ;)
            break
    if should:
        continue