osu-wayback/autoSchedule/scrape.py

import urllib.request, json
import time, calendar

from objects import glob

finished = True

def run():
	global finished

	if not finished:
		print("[!] Scraper job is already running! ")
		return

	print("[!] Starting scraping job...")
	sql = glob.new_sql()
	cur = sql.cursor()

	cur.execute("SELECT max(file_version) f FROM updates")
	target = cur.fetchone()["f"]

	finished = False
	failed_streak = 0
	while not finished:
		target += 1

		attempts = 0
		extra_sleep = 0
		while attempts < glob.config["scrape"]["max_attempts"]:
			try:
				with urllib.request.urlopen("https://osu.ppy.sh/web/check-updates.php?action=path&stream=stable&target=%s" % target) as url:
					data = json.loads(url.read().decode())[0]
					if "url_patch" not in data.keys():
						data["url_patch"] = None
					cur.execute("INSERT INTO updates (file_version,filename,file_hash,filesize,timestamp,patch_id,url_full,url_patch) VALUES ('%s','%s','%s','%s',%s,'%s','%s','%s')" %
						(
							data["file_version"],
							data["filename"],
							data["file_hash"],
							data["filesize"],
							calendar.timegm(time.strptime(data["timestamp"], "%Y-%m-%d %H:%M:%S")),
							data["patch_id"],
							data["url_full"],
							data["url_patch"],
					))
					sql.commit()
					failed_streak = 0
					print("[Scraper] Target: %s, Status: OK" % target)
					break
			except:
				attempts += 1
				failed_streak += 1
				if glob.config["scrape"]["increase_delay_on_fail"]:
					extra_sleep = attempts
				print("[Scraper] Target: %s, status: FAILED, Attempt: %s" % (target, attempts))

			time.sleep(glob.config["scrape"]["delay"] + extra_sleep)
			if failed_streak > glob.config["scrape"]["skips_until_finished"]:
				finished = True
				break

	cur.close()
	sql.close()
	print("[Scraper] Finished!")