Compare commits


2 Commits

SHA1        Message        Date
05ecccf0da  Del old files  2019-03-11 16:21:31 +01:00
d75985651f  Scraper        2019-03-11 16:14:03 +01:00
6 changed files with 92 additions and 130 deletions

autoSchedule/__init__.py (new file, 19 additions)

@@ -0,0 +1,19 @@
from time import time, sleep
from threading import Thread

from . import scrape
from objects import glob


def schedule_loop():
    if glob.config["scrape"]["run_on_startup"]:
        scrape.run()
    while True:
        _time = time() + glob.config["scrape"]["schedule_seconds"]
        while _time - time() > 0:
            sleep(1)
        scrape.run()


thread = Thread(target = schedule_loop)
thread.start()
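
The module above starts its scheduler at import time as a regular, non-daemon thread, so the thread keeps the process alive even after Tornado's IOLoop exits. Below is a minimal sketch of a daemon-thread variant; start_scheduler is a hypothetical helper and the daemon=True flag is standard threading.Thread behaviour, not something this commit uses.

from threading import Thread

def start_scheduler(loop_fn):
    # Hypothetical helper (not in the commit): a daemon thread is torn down
    # with the main process instead of keeping it alive after shutdown.
    thread = Thread(target = loop_fn, daemon = True)
    thread.start()
    return thread

# Usage would be start_scheduler(schedule_loop) in place of the last two lines above.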

autoSchedule/scrape.py (new file, 64 additions)

@@ -0,0 +1,64 @@
import urllib.request, json
import time, calendar

from objects import glob

finished = True


def run():
    global finished

    if not finished:
        print("[!] Scraper job is already running! ")
        return
    print("[!] Starting scraping job...")

    sql = glob.new_sql()
    cur = sql.cursor()
    cur.execute("SELECT max(file_version) f FROM updates")
    target = cur.fetchone()["f"]

    finished = False
    failed_streak = 0
    while not finished:
        target += 1
        attempts = 0
        extra_sleep = 0
        while attempts < glob.config["scrape"]["max_attempts"]:
            try:
                with urllib.request.urlopen("https://osu.ppy.sh/web/check-updates.php?action=path&stream=stable&target=%s" % target) as url:
                    data = json.loads(url.read().decode())[0]
                if "url_patch" not in data.keys():
                    data["url_patch"] = None
                cur.execute("INSERT INTO updates (file_version,filename,file_hash,filesize,timestamp,patch_id,url_full,url_patch) VALUES ('%s','%s','%s','%s',%s,'%s','%s','%s')" %
                    (
                        data["file_version"],
                        data["filename"],
                        data["file_hash"],
                        data["filesize"],
                        calendar.timegm(time.strptime(data["timestamp"], "%Y-%m-%d %H:%M:%S")),
                        data["patch_id"],
                        data["url_full"],
                        data["url_patch"],
                    ))
                sql.commit()
                failed_streak = 0
                print("[Scraper] Target: %s, Status: OK" % target)
                break
            except:
                attempts += 1
                failed_streak += 1
                if glob.config["scrape"]["increase_delay_on_fail"]:
                    extra_sleep = attempts
                print("[Scraper] Target: %s, status: FAILED, Attempt: %s" % (target, attempts))
            time.sleep(glob.config["scrape"]["delay"] + extra_sleep)
        if failed_streak > glob.config["scrape"]["skips_until_finished"]:
            finished = True
            break

    cur.close()
    sql.close()
    print("[Scraper] Finished!")

@@ -3,23 +3,16 @@
     "host": "0.0.0.0",
     "port": 3003
   },
-  "sql": {
-    "host": "127.0.0.1",
-    "user": "root",
-    "passwd": "toor",
-    "db": "osu-wayback"
-  },
   "scrape": {
     "delay": 1,
     "max_attempts": 3,
-    "increase_delay_on_fail": true
+    "increase_delay_on_fail": true,
+    "skips_until_finished": 17,
+    "schedule_seconds": 68400,
+    "run_on_startup": true
   },
   "downloader": {
     "download_folder": "/home/wayback/files"
   },
-  "zipper": {
-    "temp_folder": "/home/wayback/tmp",
-    "output_folder": "/home/wayback/archive"
-  },
   "threads": 4
 }
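
Two of the new keys are worth spelling out: schedule_seconds = 68400 is 19 hours between scheduled runs (a full day would be 86400), and skips_until_finished = 17 is the failure-streak length (counted per attempt in scrape.run()) at which the scraper gives up. A minimal sketch of reading these values with plain json.load, mirroring the open("config.json", ...) pattern in the deleted scripts below; the actual loader behind glob.config is not shown in this diff:

import json

with open("config.json", "r") as f:
    config = json.load(f)

interval = config["scrape"]["schedule_seconds"]        # 68400 s = 19 h between runs
run_now = config["scrape"]["run_on_startup"]           # scrape once immediately at startup
max_streak = config["scrape"]["skips_until_finished"]  # stop after this many consecutive failed attempts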

@@ -76,4 +76,7 @@ if __name__ == "__main__":
     print("To stop server press CTRL + C")
     glob.app.listen(glob.config["server"]["port"], address=glob.config["server"]["host"])
     tornado.log.enable_pretty_logging()
-    tornado.ioloop.IOLoop.instance().start()
+
+    import autoSchedule
+    tornado.ioloop.IOLoop.instance().start()
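
The placement of the new import matters: importing autoSchedule is what spawns the scheduler thread (see autoSchedule/__init__.py above), and IOLoop.instance().start() blocks until the server shuts down, so the import has to come first. The resulting startup order, with the surrounding Tornado setup elided as in the hunk:

tornado.log.enable_pretty_logging()

import autoSchedule                       # side effect: starts schedule_loop in a background thread
tornado.ioloop.IOLoop.instance().start()  # blocks here while the server runs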

@@ -1,67 +0,0 @@
#Used for scraping osu files from osu.ppy.sh and storing them into self database
import urllib.request, json
import MySQLdb
import MySQLdb.cursors
import time, calendar
import atexit

finished = False

with open("config.json", "r") as f:
    config = json.load(f)
with open("memory.json", "r") as f:
    memory = json.load(f)

sql = MySQLdb.connect(**config["sql"], cursorclass = MySQLdb.cursors.DictCursor)
cur = sql.cursor()


def on_close():
    with open("memory.json", "w") as f:
        json.dump(memory, f)
    print("Closing...")


atexit.register(on_close)

failed_streak = 0
while not finished:
    target = memory["scrape"]["last"] + 1
    attempts = 0
    completed = False
    extra_sleep = 0
    while attempts < config["scrape"]["max_attempts"] and not completed:
        try:
            with urllib.request.urlopen("https://osu.ppy.sh/web/check-updates.php?action=path&stream=stable&target={}".format(target)) as url:
                data = json.loads(url.read().decode())[0]
            if "url_patch" not in data.keys():
                data["url_patch"] = None
            cur.execute("INSERT INTO updates (file_version,filename,file_hash,filesize,timestamp,patch_id,url_full,url_patch) VALUES (%s,%s,%s,%s,%s,%s,%s,%s)",
                [
                    data["file_version"],
                    data["filename"],
                    data["file_hash"],
                    data["filesize"],
                    calendar.timegm(time.strptime(data["timestamp"], "%Y-%m-%d %H:%M:%S")),
                    data["patch_id"],
                    data["url_full"],
                    data["url_patch"]
                ])
            sql.commit()
            completed = True
            failed_streak = 0
            print("target: {}, status: OK".format(target))
        except:
            if target not in memory["scrape"]["failed"]:
                memory["scrape"]["failed"].append(target)
            attempts += 1
            failed_streak += 1
            if config["scrape"]["increase_delay_on_fail"]:
                extra_sleep = attempts
            print("target: {}, status: FAILED, attempt: {}".format(target, attempts))
        time.sleep(config["scrape"]["delay"] + extra_sleep)
    if failed_streak > 100:
        exit()
    memory["scrape"]["last"] = target

@@ -1,50 +0,0 @@
#Used to zip all the files into new local folders after downloader is done
import urllib.request, json
import MySQLdb
import MySQLdb.cursors
import os
import atexit

with open("config.json", "r") as f:
    config = json.load(f)
with open("memory.json", "r") as f:
    memory = json.load(f)

sql = MySQLdb.connect(**config["sql"], cursorclass = MySQLdb.cursors.DictCursor)
cur = sql.cursor()


def on_close():
    with open("memory.json", "w") as f:
        json.dump(memory, f)
    print("Closing...")


atexit.register(on_close)

cur.execute("SELECT file_version,filename,file_hash,url_full FROM updates")
data = cur.fetchall()

# Remove already downloaded files (checked from memory.json)
data = data[memory["zipper"]["last"]:]

# Unfinished - replace with zipper code
"""
for row in data:
    try:
        print("Downloading {} with id {}".format(row["filename"], row["file_version"]))
        urllib.request.urlretrieve(
            row["url_full"],
            os.path.join(
                config["downloader"]["download_folder"],
                row["filename"],
                "f_" + row["file_hash"]
            )
        )
        print("Done.")
    except Exception as e:
        memory["downloader"]["failed"].append(row["file_version"])
        print("Error downloading file {}: {}".format(row["file_version"], e))
    memory["downloader"]["last"] += 1
"""